#library reticulate to be able to add chunks in another language
#library(reticulate)
#To run this .rmd file in a terminal being in the project folder:
#export PATH=${PATH}:/cm/shared/apps/R/deps/rstudio/bin/pandoc
#file="./report/Variant_to_Gene_Report.Rmd"
#Rscript -e 'rmarkdown::render("'$file'")'

Rationale

Identify and prioritise genes using variant-to-gene mapping tools with credible sets variants from fine-mapping results of GWAS on severe asthma in UK Biobank European individuals.
Master Code (overall pipeline): src/Var_to_Gene_pipeline.sh

#!/usr/bin/env bash

#Rationale: pipeline for Variant to Gene mapping analysis. Output from each analysis: list of genes



module load R
#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"


################
#1.VARIANT ANNOTATION
################
#From FAVOR website: Two Info:
#Nearest gene for sentinel with highest PIP in each locus
#Functional annotated credset variants according to FANTOM5, ClinVar and Integrative Functional Score criteria
Rscript src/Variant_annotation_FAVOR.R input/FAVOR_credset_chrpos38_2023_08_08.csv
#Add nearest genes to the var2gene_raw.xlsx file.


################
#2.1 COLOCALISATION - eQTL
#GTExV8, eQTLGen, UBCLung
################
##Exclude chromosome 6 - no eQTL colocalisation for this locus.
##Genomic boundaries: PIP-max causal variant +/- 1Mb

cs=('SA_8_81292599_C_A' 'SA_6_90963614_AT_A' 'SA_5_110401872_C_T' 'SA_2_242692858_T_C' 'SA_15_67442596_T_C' 'SA_12_56435504_C_G' 'SA_11_76296671_G_A' 'SA_9_6209697_G_A' 'SA_5_131885240_C_G' 'SA_3_33042712_C_T' 'SA_2_102913642_AAAAC_A' 'SA_17_38168828_G_A' 'SA_16_27359021_C_A'  'SA_15_61068954_C_T' 'SA_12_48202941_T_C' 'SA_10_9064716_C_T')
cs_all="/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt"

###GTExv8 eQTL###

##Create eQTl files in hg19 for Colon Transverse, Colon Sigmoid, Skin_Not_Sun_Exposed_Suprapubic, Skin_Sun_Exposed_Lower_leg
#From GTExV8 .parquet files and in hg38.
mkdir ${tmp_path}/liftover_gtexv8
mkdir ${tmp_path}/liftover_gtexv8/bed

dos2unix src/coloc/000A_eqtl_gtex_extraction.R src/coloc/000B_eqtl_gtex_liftover.sh \
    src/coloc/000C_eqtl_gtex_conversion.R src/coloc/000A_submit_eqtl_gtex_extraction.sh src/coloc/000C_submit_eqtl_gtex_conversion.sh
chmod o+x src/coloc/000A_eqtl_gtex_extraction.R src/coloc/000B_eqtl_gtex_liftover.sh \
    src/coloc/000C_eqtl_gtex_conversion.R src/coloc/000A_submit_eqtl_gtex_extraction.sh src/coloc/000C_submit_eqtl_gtex_conversion.sh

sbatch --array=1-22 src/coloc/000A_submit_eqtl_gtex_extraction.sh

sbatch src/coloc/000B_eqtl_gtex_liftover.sh

sbatch --array=1-22 src/coloc/000C_submit_eqtl_gtex_extraction.sh


##variables needed:
tissue=('Stomach' 'Small_Intestine_Terminal_Ileum' 'Lung' 'Esophagus_Muscularis' 'Esophagus_Gastroesophageal_Junction' 'Artery_Tibial' 'Artery_Coronary' 'Artery_Aorta' 'Colon_Transverse' 'Colon_Sigmoid' 'Skin_Sun_Exposed_Lower_leg' 'Skin_Not_Sun_Exposed_Suprapubic')

##Divide the credible sets into separate files:
Rscript ./src/coloc/000_preprocess_cs.R $cs_all $tmp_path

#Obtain GWASpairs from credible set regions:
#.sh will run .R script:
for c in ${!cs[*]}; do

  sbatch --export=CREDSET="${cs[c]}" ./src/coloc/001_submit_GWASpairs.sh

done

#Create files for GTExV8:
##'Colon Transverse' and 'Colon Sigmoid' miss from /data/gen1/ACEI/colocalisation_datasets/eQTL/GTeX/
##Need to create these
#.sh will run .R script:
for t in ${!tissue[*]}; do

  for c in ${!cs[@]}; do

    sbatch --export=TISSUE="${tissue[t]}",CREDSET="${cs[c]}" ./src/coloc/001_submit_eqtl_lookup_GTEx.sh

  done

done


##Get the LD matrix:
##Create the file with gtex-locus pairs:
for t in ${!tissue[*]}; do Rscript ./src/coloc/002_prepare_LDinput.R "${tissue[t]}"; done

##Get LD:
#with parameters for GTExV8
sbatch ./src/coloc/002_get_LD.sh

##to see if errors in the job: grep "Error" /scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/ld-81713.out
##after adding Colon and Skin tissues, to see if errors in the job: grep "Error" /scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/ld-153659.out

##Run the colocalisation for GTExV8:
#.sh will run .R script:
mkdir ${tmp_path}/results
mkdir ${tmp_path}/results/gtex
#run for each Tissue:
tissue='Stomach'
tissue='Small_Intestine_Terminal_Ileum'
tissue='Lung'
tissue='Esophagus_Muscularis'
tissue='Esophagus_Gastroesophageal_Junction'
tissue='Artery_Tibial'
tissue='Artery_Coronary'
tissue='Artery_Aorta'
tissue='Colon_Transverse'
tissue='Colon_Sigmoid'
tissue='Skin_Sun_Exposed_Lower_leg'
tissue='Skin_Not_Sun_Exposed_Suprapubic'

for c in ${!cs[*]}; do

  N=`cat ${tmp_path}${cs[c]}_${tissue}_genes.txt | wc -l`

  sbatch --array=1-${N}%20 --export=TISSUE="${tissue}",CREDSET="${cs[c]}" ./src/coloc/003_submit_coloc_susie_GTEx.sh

 sleep 5

done

######QUALITY CHECKS:
#to find number of genes:
#wc -l SA_*_${tissue}_genes.txt | sed 's/_/ /g' | sort -k 3,4 -g | awk '{print $1}'
#'Stomach' 575
#'Small_Intestine_Terminal_Ileum' 635
#'Lung' 633
#'Esophagus_Muscolaris' 564
#'Esophagus_Gastroesophageal_Junction' 576
#tissue='Artery_Tibial' 573
#tissue='Artery_Coronary' 603
#tissue='Artery_Aorta' 586
#tissue='Colon_Transverse' 610
#tissue='Colon_Sigmoid' 585
#tissue='Skin_Sun_Exposed_Lower_leg' 673
#tissue='Skin_Not_Sun_Exposed_Suprapubic' 670

##Check that all genes for each tissue have been analysed:
grep ${tissue} ${tmp_path}/logerror/coloc_susie_gtex*.out | awk -F ":" '{print $1}' | sort -u | wc -l

##Check how many genes per tissue have been analysed for colocalisation:
ls -lthr  ${tmp_path}/results/gtex/*all_coloc.rds | grep ${tissue} | wc -l
ls -lthr ${tmp_path}/results/gtex/*all_susie*.rds | grep ${tissue} | wc -l

#gtex converted in gene symbol:
#https://www.biotools.fr/human/ensembl_symbol_converter
#added in GTExV8_eQTL_genes_symbol table in the input/var2gene.xlsx file.

awk 'NR ==1; $11 == "TRUE" {print $0}' ${tmp_path}/results/coloc_asthma_GTEx.tsv \
    > output/coloc_asthma_GTEx.tsv

awk 'NR ==1; $16 == "TRUE" {print $0}' ${tmp_path}/results/colocsusie_asthma_GTEx.tsv \
    > output/colocsusie_asthma_GTEx.tsv

###eqtlGen eQTL###
mkdir ${tmp_path}/results/eqtlgen
mkdir ${tmp_path}/eqtlgen
dos2unix src/coloc/000_submit_edit_eQTLGen.sh src/coloc/000_run_edit_eQTLGen.R
chmod +x src/coloc/000_submit_edit_eQTLGen.sh src/coloc/000_run_edit_eQTLGen.R
sbatch src/coloc/000_submit_edit_eQTLGen.sh

#Create files for eQTLGen colocalisation:
dos2unix src/coloc/001_submit_eqtl_lookup_eQTLGen.sh src/coloc/001_run_eqtl_lookup_eQTLGen.R
chmod +x src/coloc/001_submit_eqtl_lookup_eQTLGen.sh src/coloc/001_run_eqtl_lookup_eQTLGen.R

for c in ${!cs[@]}; do

    sbatch --export=CREDSET="${cs[c]}" ./src/coloc/001_submit_eqtl_lookup_eQTLGen.sh

done

##Get the LD matrix:
##Create the file with gtex-locus pairs:
dos2unix src/coloc/002_prepare_LDinput_eqtlgen.R
chmod +x src/coloc/002_prepare_LDinput_eqtlgen.R
Rscript src/coloc/002_prepare_LDinput_eqtlgen.R "eqtlGenWB"

##Get LD:
#with parameters for eqtlGen
sbatch src/coloc/002_get_LD.sh
##to see if errors in the job: grep "Error" /scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/ld-170767.out

##Run the colocalisation for GTExV8:
#.sh will run .R script:
mkdir ${tmp_path}/results/eqtlgen
dos2unix src/coloc/003_submit_coloc_susie_eQTLGen.sh
chmod +x src/coloc/003_run_coloc_susie_eQTLGen.R
for c in ${!cs[*]}; do

  N=`cat ${tmp_path}eqtlgen/${cs[c]}_eqtlGenWB_genes.txt | wc -l`

  sbatch --array=1-${N}%20 --export=CREDSET="${cs[c]}" ./src/coloc/003_submit_coloc_susie_eQTLGen.sh

 sleep 5

done

######QUALITY CHECKS:
#to find number of genes: 468
wc -l ${tmp_path}/eqtlgen/SA_*_eqtlGenWB_genes.txt | sed 's/_/ /g' | sort -k 3,4 -g | awk '{print $1}'
##Check that all genes for each tissue have been analysed:
grep "eqtlGenWB" ${tmp_path}/logerror/coloc_susie_eqtlgen*.out | awk -F ":" '{print $1}' | sort -u | wc -l

##Check how many genes per tissue have been analysed for colocalisation:
ls -lthr  ${tmp_path}/results/eqtlgen/*all_coloc.rds | grep "eqtlGenWB" | wc -l
ls -lthr ${tmp_path}/results/eqtlgen/*all_susie*.rds | grep "eqtlGenWB" | wc -l


awk 'NR ==1; $11 == "TRUE" {print $0}' /scratch/gen1/nnp5/Var_to_Gen_tmp/results/coloc_asthma_eqtlgen.tsv \
    > output/coloc_asthma_eqtlgen.tsv

#no true results fo rcolocsusie in eqtlGen


#Find statistically significant colocalisation results for GTExV8 and eqtlGen eQTL, and add results into var2gene_raw.xlsx:
Rscript ./src/coloc/004_concat_coloc_results.R


###UBCLung eQTL###
mkdir ${tmp_path}/results/ubclung
mkdir ${tmp_path}/ubclung
dos2unix src/coloc_UBClung/*
chmod +x src/coloc_UBClung/*
for c in ${!cs[*]}; do

  sbatch --export=CREDSET="${cs[c]}" ./src/coloc_UBClung/000_submit_lookup_lung_eQTL.sh

done


##Get the LD matrix:
##Create the file with gtex-locus pairs:
dos2unix src/coloc_UBClung/002_prepare_LDinput_ubclung.R
chmod +x src/coloc_UBClung/002_prepare_LDinput_ubclung.R
Rscript src/coloc_UBClung/002_prepare_LDinput_ubclung.R "UBCLung"

##Get LD:
#with parameters for UBCLung
sbatch src/coloc/002_get_LD.sh
##to see if errors in the job: grep "Error" /scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/ld-295744.out

##Create UBCLung eQTL regional data from eQTL summary stats:
#mkdir ${tmp_path}/ubclung/eQTL_region_stat/
module load tabix
lung_eQTL="/data/gen1/reference/lung_eQTL"

for c in ${!cs[@]}
    do
      CREDSET="${cs[c]}"
      read chr < <(echo $CREDSET | awk -F "_" '{print $2}')
      read pos < <(echo $CREDSET | awk -F "_" '{print $3}')
      pos1=`expr $pos - 1000000`
      pos2=`expr $pos + 1000000`
      if (( pos1 < 0 )); then
          pos1=0
      fi
      tabix -h ${lung_eQTL}/METAANALYSIS_Laval_UBC_Groningen_chr${chr}_formatted.txt.gz ${chr}:${pos1}-${pos2} > ${tmp_path}/ubclung/eQTL_region_stat/${CREDSET}_eQTL_region.txt
done

##Run colocalisation:
dos2unix src/coloc_UBClung/003_submit_coloc_susie_lung_eQTL.sh
dos2unix src/coloc_UBClung/003_run_coloc_susie_lung_eQTL.r
chmod +x src/coloc_UBClung/003_submit_coloc_susie_lung_eQTL.sh
chmod +x src/coloc_UBClung/003_run_coloc_susie_lung_eQTL.r

for c in ${!cs[*]}; do

  N=`cat ${tmp_path}ubclung/${cs[c]}_UBCLung_probesets.txt | wc -l`

  sbatch --array=1-${N}%20 --export=CREDSET="${cs[c]}" ./src/coloc_UBClung/003_submit_coloc_susie_lung_eQTL.sh

 sleep 5

done

######QUALITY CHECKS:
#to find number of genes: 468
wc -l ${tmp_path}/ubclung/SA_*_UBCLung_probesets.txt | sed 's/_/ /g' | sort -k 3,4 -g | awk '{print $1}'
##Check that all genes for each tissue have been analysed:
grep "UBClung" ${tmp_path}/logerror/coloc_susie_UBCLung*.out | awk -F ":" '{print $1}' | sort -u | wc -l

##Check how many genes per tissue have been analysed for colocalisation:
ls -lthr  ${tmp_path}/results/ubclung/*all_coloc.rds | grep "ubclung" | wc -l
ls -lthr ${tmp_path}/results/ubclung/*all_susie*.rds | grep "UBCLung" | wc -l

awk 'NR ==1; $13 == "TRUE" {print $0}' ${tmp_path}/results/coloc_asthma_ubclung.tsv \
    > output/coloc_asthma_ubclung.tsv
#no TRUE colocsusie resutls for UBCLung


#Find statistically significant colocalisation results for UBCLung, and add results into var2gene_raw.xlsx:
#R gave error for xlsx and Java, so I find statistically significant colocalisation results for GTExV8 and eqtlGen eQTL, and add results into var2gene_raw.xlsx:
Rscript ./src/coloc_UBClung/004_concat_coloc_results_ubclung.R

#Added genes into Var_to_Gene/input/var2genes_raw.xlsx file.

#Merge all the gene from eQTL colocalisation:
Rscript src/merge_genes_eqtl_coloc.R

################
#2.2 APPROXIMATE COLOCALISATION - pQTL (LOOK-UP FOR ALL CREDIBLE SET VARIANTS)
#UKB, SCALLOP, deCODE
################
##Exclude chromosome 6 - no eQTL colocalisation for this locus.
##Genomic boundaries: PIP-max causal variant +/- 1Mb

###UKBIOBANK pQTL LOOK-UP###
#Nick generate the pQTL dataset, as for /data/gen1/UKBiobank/olink/pQTL/README.txt:
# OLINK pQTL analysis
#* 48,195 European samples (as defined in Shrine et al. 2023)
#* 1463 proteins (proteins.txt)
#* Phenotype: untransformed log2 fold protein levels (olink_pheno.txt)
#* Covariates: olink batch, age, sex, genotyping array, PC1-10 (olink_covar.txt)
#* 44.8 million MAC >= 5 variants from HRC+UK10K imputation (/data/ukb/imputed_v3)
#* Additive model with regenie (run_step1.sh & run_step2.sh)
#* Full results for each protein tabixed in results directory
#* Script pqtl_lookup.sh for look ups, run with no arguments for usage info

# Nick created a look up script for UKB pQTL:
#/data/gen1/UKBiobank/olink/pQTL/pqtl_lookup.sh -s <RSID> -r <CHR:START-END> [ -f <PROTEINS FILE> ] [ -p <P THRESHOLD> ] [-h]
#UKB pQTL is in GRCh38, need to find sentinel variants position in GRCh38:
mkdir ${tmp_path}/ukb_pqtl
#input for liftOver:
awk -F ' ' 'NR > 1 {print "chr"$3, $4, $4+1}' $cs_all \
    > ${tmp_path}/ukb_pqtl/cs_vars_liftover_input.txt
## download the chain file b37 to b38
wget -P /home/n/nnp5/software/liftover https://hgdownload.soe.ucsc.edu/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz

/home/n/nnp5/software/liftover/liftOver \
    ${tmp_path}/ukb_pqtl/cs_vars_liftover_input.txt \
    /home/n/nnp5/software/liftover/hg19ToHg38.over.chain.gz \
    ${tmp_path}/ukb_pqtl/cs_vars_liftover_output.bed \
    ${tmp_path}/ukb_pqtl/cs_vars_liftover_unlifted.bed

##divide cs vars into the respective credset with b38 location:
dos2unix src/pQTL_coloc/000_preprocess_cs_b38.R
chmod +x src/pQTL_coloc/000_preprocess_cs_b38.R
Rscript src/pQTL_coloc/000_preprocess_cs_b38.R \
     $cs_all \
     $tmp_path/ukb_pqtl/ \
     ${tmp_path}/ukb_pqtl/cs_vars_liftover_output.bed

#pvalue theshold based on bonferroni correction by the number of measured proteins:
dos2unix src/pQTL_coloc/000_submit_lookup_ukbpqtl.sh
chmod +x src/pQTL_coloc/000_submit_lookup_ukbpqtl.sh
sbatch --array 0-16 src/pQTL_coloc/000_submit_lookup_ukbpqtl.sh

#check that all credible sets variants have been analysed:
find ${tmp_path}/ukb_pqtl/*rs*/ -type f | wc -l

#combine look-up ukb-pqtl files with Nick's script (from /data/gen1/UKBiobank/olink/pQTL/orion_pain):
cd ${tmp_path}/ukb_pqtl/
/home/n/nnp5/PhD/PhD_project/Var_to_Gene/src/pQTL_coloc/001_combine_pqtl.awk *rs*/* \
    > ${tmp_path}/ukb_pqtl/lookup_ukbpqtl.txt
cp ${tmp_path}/ukb_pqtl/lookup_ukbpqtl.txt output/
cd /home/n/nnp5/PhD/PhD_project/Var_to_Gene/ #of wherever the folder 'Var_to_Gene' is located
#extract gene showing look-up results:
awk 'NR > 1 {print $2}' ${tmp_path}/ukb_pqtl/lookup_ukbpqtl.txt | sort -u \
    > input/ukbpqtl_var2genes_raw


###deCODE pQTL LOOK-UP###
#No look-up results for deCode
#Found this script of Nick for decode lookup (from /data/gen1/TSH/coloc_susie/lookup_decode.awk):
#create credible_set.snps: create credible_set.snps in alphabetical order
#($5 < $6 ? $5 : $6)"_"($6 > $5 ? $6 : $5)
#locus85    10_101220474_A_G
#locus85    10_101221275_C_T
mkdir ${tmp_path}/decode_pqtl
#From Chiara/Kayesha script and Nick:
sbatch src/pQTL_coloc/000_submit_lookup_decode.sh
#filter out gene name with significant pQTL:
awk 'NR > 1 && $5 > 0 {print $1}' ${tmp_path}/decode_pqtl/log_pQTL_decode_analysis | sed 's/.txt//g' \
    > input/decode_pqtl_var2genes_raw


###SCALLOP pQTL LOOK-UP###
#From Chiara's script R:\TobinGroup\GWAtraits\Chiara\pQTL_SCALLOP and Nick's script scallop_lookup.awk
mkidr ${tmp_path}/scallop_pqtl
#From Chiara and Nick script:
sbatch src/pQTL_coloc/000_submit_lookup_scallop.sh
#filter out gene name with significant pQTL:
awk 'NR > 1 && $5 > 0 {print $1}' ${tmp_path}/scallop_pqtl/log_pQTL_SCALLOP_analysis | sed 's/.txt//g' \
    > input/scallop_pqtl_var2genes_raw

##Chrom Pos MarkerName  Allele1 Allele2 Freq1   FreqSE  Effect  StdErr  P-value Direction   TotalSampleSize Gene
awk -F "\t" '$10 < 5e-8 {print $0, $13="CA-125"}' ${tmp_path}/scallop_pqtl/CA-125.txt \
    > output/scallop_ukbpqtl.txt
awk -F "\t" '$10 < 5e-8 {print $0, $13="ST2"}' ${tmp_path}/scallop_pqtl/ST2.txt \
    >> output/scallop_ukbpqtl.txt

#Merge genes from the different pQTL look-up analyses:
cat input/ukbpqtl_var2genes_raw input/scallop_pqtl_var2genes_raw input/decode_pqtl_var2genes_raw \
    | awk '{print $2="pQTL", $1}' > input/pqtl_lookup_genes_merged


################
#3 POLYGENIC PRIORITY SCORE (PoPS)
################
#https://github.com/FinucaneLab/pops
#Looking at the github repo and to Jing's code, I compiled PoPS in PoPS.sh and submit_pops.sh:

#PoPS.sh internally calls submit_pops.sh to be submitted as a job - it requires lots of time.
mkdir ${tmp_path}/pops
mkdir ${tmp_path}/pops/results
bash PoPS.sh

#Look at the results and find top score genes within a +/-250Kb from highest-PIP variant for each locus -
#if no top genes in +/- 250Kb, enlarge the window to +/-500Kb:
Rscript src/PoPS/PoPS_summary.R

#Added /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/pops_var2genes_raw.txt to the var2genes_raw.xlsx.

################
#3 NEARBY HUMAN ORTHOLOG MOUSE KO GENE
################
#From Jing's code as well as my own input:
#Download genotype-phenotype data from IMPC (International Mouse Phenotyping consortium)
#https://www.mousephenotype.org/data/release
mkdir ${tmp_path}/mouse_ko
#latest release 2023-07-06:
wget -P ${tmp_path}/mouse_ko/ https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/latest/results/genotype-phenotype-assertions-ALL.csv.gz
#latest release 2023-11-22:
wget -P ${tmp_path}/mouse_ko/ http://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_mouse_hcop_fifteen_column.txt.gz
#run the analysis:
Rscript src/mouse_ko/mouse_ko.r > ${tmp_path}/mouse_ko/output_mouse_ko
#Upload /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/mouse_ko_genes_raw.txt genes into var2genes_raw.xlsx.
cp ${tmp_path}/mouse_ko/results_500kb.csv input/mko_results_500kb.csv

################
#4 NEARBY RARE MENDELIAN DISEASE GENE
################
#From Jing's code as well as my own input:
#Download genotype-phenotype data from https://www.orphadata.com/genes/:
mkdir ${tmp_path}/rare_disease/
#latest release 2023-06-22:
wget -P ${tmp_path}/rare_disease/ https://www.orphadata.com/data/xml/en_product6.xml
#downloaded locally and converted in xlsx in Excel - uploaded in /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/en_product6.xlsx
#dowload description of the file:
wget -P src/report/ https://www.orphadata.com/docs/OrphadataFreeAccessProductsDescription.pdf
#Downloaded hpo: human phenotype ontology provides a standardized vocabulary of phenotypic abnormalities encounterd in human disease
#latest release June 2023:
#Downloaded locally and converted in xlsx in Excel - uploaded in /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/en_product4.xlsx
#https://github.com/Orphanet/Orphadata_aggregated/blob/master/Rare%20diseases%20with%20associated%20phenotypes/en_product4.xml
#run the analysis:
Rscript src/rare_disease/rare_disease.r > ${tmp_path}/rare_disease/output_rare_disease
#Upload /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/rare_disease_genes_raw.txt genes into var2genes_raw.xlsx.
cp ${tmp_path}/rare_disease/results_500kb_by_gene.csv input/raredis_results_500kb_by_gene.csv

################
#5 RARE VARIANT UKBiobank ANALYSIS in RAP
################
#Prep file ALICE3:
##Match UK Biobank application IDs:
mkdir ${tmp_path}/rare_variant/
Rscript src/rare_variant/000_dataprep_rarevar.R
#Copy the pheno covatiare file in /rfs/:
cp ${tmp_path}/rare_variant/demo_EUR_pheno_cov_broadasthma_app88144.txt /rfs/TobinGroup/data/UKBiobank/application_88144/
#Gene-collapsing rare variant analysis:
#https://github.com/legenepi/rare_collapsing
#Single rare variant analysis: NB - need to do it for rare variant ExWAS ! change filter in plink!
#src/rare_variant/submit_rare_variant.sh

################
#6 TABLES FOR EACH ANALYSIS AND MERGE GENES FOR GENE PRIORITISATION AND VISUALISATION
################
#TO NOTE: ONLY FOR
#nearest gene
#functional annotation
#eQTL
#mouse_KO
#PoPS
#pQTL
#rare_disease
#SINGLE AND GENE-BASED COLLAPSING ANALYSIS: no genes for variant-to-gene mapping, so I did not add them to this table
Rscript src/Locus_to_genes_table.R
Rscript src/genes_heatmap.R
cp output/V2G_heatmap_subplots.png /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/
cp src/report/var2gene_full.xlsx /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/

################
#7 REGION PLOTS WITH VAR2GENE RESULTS
################
#bash Region_plot_V2G.sh
#Region_plot_V2G.R
#Region_plot_V2G_2.R



#######ADDITIONAL - COMPARISON WITH VALETTE ET AL. 2021 RESULTS#############
#how many evidence for each gene?
awk '$3 == 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq -c | sort -k1 -r
awk '$3 == 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq -c | awk '$1 != 1 {print $2}' \
    > ${tmp_path}/gene_2plus_evidence
#Are these genes with 2+ evidence already described/found in asthma studies ?
##Valette et al. 2021:
wget -F ${tmp_path}/ https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8187656/bin/42003_2021_2227_MOESM4_ESM.xlsx
#SD17: "Supplementary Data 17. Druggability of the 806 target genes identified in this study.
#Put all the 806 gene identified by Valette et al. 2021 into this file:
nano ${tmp_path}/all_genes_Valette2021
grep -F -f ${tmp_path}/gene_2plus_evidence ${tmp_path}/all_genes_Valette2021 | wc -l
#All genes are identified by Valette et al.2021
#59 genes, among which all the 2+ evidence were found by Valette et al. as well:
awk 'NR > 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq  | grep -w -F -f - ${tmp_path}/all_genes_Valette2021 | wc -l
grep -w -F -f ${tmp_path}/gene_2plus_evidence ${tmp_path}/all_genes_Valette2021 | wc -l
#34 NOT IDENTIFIED BY VALETTE (but all with just one level of evidence in my analysis):
awk 'NR > 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq  | \
    grep -v -w -F -f ${tmp_path}/all_genes_Valette2021 - | sort | sed -z 's/\n/,/g;s/,$/\n/'
#In green are 29 target genes that overlapped among lung TWAS genes, blood eGenes, and chromatin contact genes."
#Put the 29 genes into this file:
#9 genes in the 29 with three level of evidence: 4 in the 2+ my level of evidence, 5 just one evidence.
nano ${tmp_path}/target_genes_Valette2021
awk 'NR > 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq  | grep -w -F -f - ${tmp_path}/target_genes_Valette2021
grep -F -f ${tmp_path}/gene_2plus_evidence ${tmp_path}/target_genes_Valette2021
#the four with two+ level of evidence:
#CAMK4
#IKZF3
#IL4R
#SMAD3
#The five with one level of evidence:
#GSDMB
#HLA-DQB1
#MED1
#RAD50
#TAP2
#Supplementary Data 18. PheWAS for the 40 genes prioritized as therapeutic targets for asthma.
#aOverall association score for asthma from the Open Targets Platform (ref21).
#bDGIdb, Drug-gene interaction database (ref22).
#cDruggable genome (ref23).
#dAsthma drug targets derived from El-Husseini et al. (ref20)
#Put the 40 genes into this file:
#9 in total: 4 in the 2+ my level of evidence, 5 just one evidence.
nano ${tmp_path}/druggable_genes_Valette2021
awk 'NR > 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq  | grep -w -F -f - ${tmp_path}/druggable_genes_Valette2021
grep -F -f ${tmp_path}/gene_2plus_evidence ${tmp_path}/druggable_genes_Valette2021
#the four with two+ level of evidence:
#IL1RL1
#RORA
#SMAD3
#IL4R
#The five with one level of evidence:
#IL18RAP
#CCR4
#IL13
#IL33
#ERBB3

#Targets of existing asthma drugs are in bold - taken from Table 4 of Valette et al 2021.
#CCR4, IL13,IL2RA,IL4R,IL5,IL6,SMAD3,TNFSF4,TSLP
nano ${tmp_path}/existing_asthma_drugtarget
#Two genes in the two+ level of evidence among the gene already targeted for asthma:
#IL4R,SMAD3
awk 'NR > 1 {print $1}' output/v2g_gene_prioritisation.txt | sort | uniq  | grep -w -F -f - ${tmp_path}/existing_asthma_drugtarget
#CCR4
#IL13
#IL4R
#SMAD3
grep -F -f ${tmp_path}/gene_2plus_evidence ${tmp_path}/existing_asthma_drugtarget
#IL4R
#SMAD3


I used several methods, mainly following the previous study of our group, Shrine et al. 2023 (https://www.nature.com/articles/s41588-023-01314-0).

Analysis

Variant Annotation


Code
src/Variant_annotation_FAVOR.R

#!/usr/bin/env Rscript

#Rationale: Variant Annotation with FAVOR

#Run as:
#Rscript src/ \


sink(stderr())

suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
suppressMessages(library(corrplot))
#suppressMessages(library(xlsx))
#suppressMessages(library(qdap))
suppressMessages(library(VennDiagram))
library(RColorBrewer)

#args <- commandArgs(T)
#favor_file <- args[1]
favor_file <- "/alice-home/3/n/nnp5/PhD/PhD_project/Var_to_Gene/input/FAVOR_credset_chrpos38_2023_08_08.csv"
favor <- fread(favor_file)

favor.digest <- favor %>% select("Variant (VCF)", "Chromosome", "Position", "rsID", "Genecode Comprehensive Category",
                                 "Genecode Comprehensive Info", "Genecode Comprehensive Exonic Category", "Genecode Comprehensive Exonic Info",
                                 "CAGE Promoter", "CAGE Enhancer", "GeneHancer", "SuperEnhancer", "CADD phred", "Fathmm XF",
                                 "aPC-Protein-Function", "aPC-Conservation", "aPC-Epigenetics-Active", "aPC-Epigenetics-Repressed",
                                 "aPC-Epigenetics-Transcription", "aPC-Local-Nucleotide-Diversity", "aPC-Mutation-Density",
                                 "aPC-Transcription-Factor", "aPC-Mappability", "Clinical Significance",
                                 "Clinical Significance (genotype includes)", "Disease Name",
                                 "Disease Name (included variant)", "Review Status", "Allele Origin",
                                 "Disease Database ID", "Disease Database ID (included variant)", "Gene Reported")

colnames(favor.digest) <- make.names(colnames(favor.digest), unique=TRUE)

#FAVOR - Nearest Gene to variants with highest PIP in the locus:
#Identify whether variants cause protein coding changes using Gencode genes definition systems
#it will label the gene name of the variants has impact, if it is intergenic region
#the nearby gene name will be labeled in the annotation.
##Retrieve Sentinel SNPs - highest PIP in the fine-mapped loci:
#cp /scratch/gen1/nnp5/Var_to_Gen_tmp/ukb_pqtl/cs_vars_liftover_output.bed /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/
sig_list_tmp <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt")
setnames(sig_list_tmp,"chromosome","chr")
setnames(sig_list_tmp,"position","posb37")
sig_list_tmp$sentinel <- paste0(sig_list_tmp$chr,"_",sig_list_tmp$posb37,"_",sig_list_tmp$allele1,"_",sig_list_tmp$allele2)
locus <- unique(sig_list_tmp$locus)

sentinels <- data.frame(matrix(ncol = 4,nrow = 0))
colnames(sentinels) <- c("locus","sentinel","chr","posb37")
for(i in locus){
    locus_sig_list <- sig_list_tmp %>% filter(locus == as.character(i))
    locus_sig_list <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(locus,sentinel,chr,posb37)
    sentinels <- rbind(sentinels,locus_sig_list)
    }

#liftover fo rhte senitnel to find b38:
posb38 <- fread("input/highestPIP_sentinels_hglft_genome_b38.bed") %>% rename(posb38=V2)
posb38  <- posb38 %>% separate(V4, c("chr", "pos2", "posb37"))
posb38  <- posb38 %>% select(chr, posb37, posb38)
posb38$chr <- as.numeric(gsub("chr","",posb38$chr))
posb38$posb37 <- as.numeric(posb38$posb37)
sentinel_b38 <- sentinels %>% left_join(posb38, by =c("chr", "posb37")) %>% rename(Chromosome=chr, Position=posb38)



###Nearest genes:
favor.digest.NG <- favor.digest %>% select(Chromosome, Position, "Genecode.Comprehensive.Info") %>% rename(Nearest_gene="Genecode.Comprehensive.Info")
sentinel_b38_NG <- sentinel_b38 %>% left_join(favor.digest.NG, by = c("Chromosome", "Position")) %>% rename(posb38=Position)
#Take closet genes when multiples are annotated:
#1:AP001189.5
#5:TSLP
#8:IL33
#9:IL1RL1
#16:HLA-DQA1
#17:AC044784.1
sentinel_b38_NG$Nearest_gene[1] <- "AP001189.5"
sentinel_b38_NG$Nearest_gene[5] <- "TSLP"
sentinel_b38_NG$Nearest_gene[8] <- "IL33"
sentinel_b38_NG$Nearest_gene[9] <- "IL1RL1"
sentinel_b38_NG$Nearest_gene[16] <- "HLA-DQA1"
sentinel_b38_NG$Nearest_gene[17] <- "AC044784.1"
fwrite(sentinel_b38_NG,"output/PIP_sentinels_nearestgenes",sep="\t",quote=F)
nearest_gene <- unique(unlist(strsplit(sentinel_b38_NG$Nearest_gene,",")))

#Categorical annotation:
##Exploratory:
summary(as.factor(favor.digest$"Genecode.Comprehensive.Category"))
summary(as.factor(favor.digest$"Genecode.Comprehensive.Info"))
##Info for the 5 coding variants:
summary(as.factor(favor.digest$"Genecode.Comprehensive.Exonic.Category"))
summary(as.factor(favor.digest$"Genecode.Comprehensive.Exonic.Info"))

#FAVOR functional:
#FANTOM5 CAGE promoter or enhancer, Functional Integrative score > 15, ClinVar
#CAGE promoter or enhancer (from FANTOM5):
fantom5 <- favor.digest %>% filter(!is.na(CAGE.Promoter) | !is.na(CAGE.Enhancer))
fantom5_genes <- unique(fantom5$Genecode.Comprehensive.Info)
fantom5_df <- fantom5 %>% select(Chromosome, Position, "Genecode.Comprehensive.Info", CAGE.Promoter, CAGE.Enhancer) %>% rename(Nearest_gene="Genecode.Comprehensive.Info")
fwrite(fantom5_df,"output/fnc_annot_fantom5",sep="\t",quote=F,row.names=F)


#Functional integrative score:
#Among 562 (53 removed because NAs) there are a correlations between aPC.Epigenetics.Active and aPC.Transcription.Factor (0.75), between aPC.Mutation.Density
#and aPC.Local.Nucleotide.Diversity (0.80); between CADD.phred and aPCs.Conservation (0.83).
favor.aPCs <- favor.digest %>% select("CADD.phred","aPC.Protein.Function", "aPC.Conservation","aPC.Epigenetics.Active",
                                      "aPC.Epigenetics.Repressed","aPC.Epigenetics.Transcription","aPC.Local.Nucleotide.Diversity",
                                      "aPC.Mutation.Density", "aPC.Transcription.Factor", "aPC.Mappability")
favor.aPCs <- na.omit(favor.aPCs)
M <- cor(favor.aPCs)
head(round(M,3))
corrplot(M, method="number")

##Integrative score > 15:
inscores <- favor.digest %>% filter(CADD.phred > 15 | aPC.Protein.Function > 15 | aPC.Conservation > 15 | aPC.Epigenetics.Active > 15 | aPC.Epigenetics.Repressed > 15 |
aPC.Epigenetics.Transcription > 15 | aPC.Local.Nucleotide.Diversity > 15 | aPC.Mutation.Density > 15 | aPC.Transcription.Factor > 15 | aPC.Mappability > 15)
inscores_genes <- unique(inscores$Genecode.Comprehensive.Info)

#Among 99 integrative score > 15 variants, these correlations change a little: between aPC.Epigenetics.Active and aPC.Transcription.Factor (0.69), between aPC.Mutation.Density
#and aPC.Local.Nucleotide.Diversity (0.63); between CADD.phred and aPCs.Conservation (0.91).
inscores.aPCs <- inscores %>% select("CADD.phred","aPC.Protein.Function", "aPC.Conservation","aPC.Epigenetics.Active",
                                      "aPC.Epigenetics.Repressed","aPC.Epigenetics.Transcription","aPC.Local.Nucleotide.Diversity",
                                      "aPC.Mutation.Density", "aPC.Transcription.Factor", "aPC.Mappability")
M2 <- cor(inscores.aPCs)
head(round(M,3))
#Save the corplot x the report:
png(file = "/home/n/nnp5/PhD/PhD_project/Var_to_Gene/output/corrplot_integrative_score_15.png")
corplot_integrativescore <- corrplot(M2, method="number")
dev.off()

inscores.aPCs_df <- inscores %>% select(Chromosome, Position, Genecode.Comprehensive.Info, "CADD.phred","aPC.Protein.Function", "aPC.Conservation","aPC.Epigenetics.Active",
                                      "aPC.Epigenetics.Repressed","aPC.Epigenetics.Transcription","aPC.Local.Nucleotide.Diversity",
                                      "aPC.Mutation.Density", "aPC.Transcription.Factor", "aPC.Mappability")
fwrite(inscores.aPCs_df,"output/fnc_annot_inscores",sep="\t",quote=F,row.names=F)

#Clinical annotation:
clinvar <- favor.digest %>% filter(!is.na(Clinical.Significance))
clin_genes <- unique(clinvar$Gene.Reported)
clinvar_df <- clinvar %>% select(Chromosome, Position, Clinical.Significance, Gene.Reported)
fwrite(clinvar_df,"output/fnc_annot_clinvar",sep="\t",quote=F,row.names=F)

#Genes from Variant Annotation:
#fantom5_genes, inscores_genes, clin_genes
##Polish the genes (remove GeneID, remove ensembl, remove where mutation occurs, keep only gene name)
#fantom_genes: everything inside parenthesis needs to be deleted, and divide genes listed together
fantom5_genes <- bracketX(fantom5_genes) %>% unique()
fantom5_genes <- unlist(strsplit(fantom5_genes, "\\s*,\\s*"))

#inscores_genes: everything inside parenthesis needs to be deleted, and divide genes listed together
inscores_genes <- bracketX(inscores_genes) %>% unique()
inscores_genes <- unlist(strsplit(inscores_genes, "\\s*,\\s*"))
inscores_genes <- unlist(strsplit(inscores_genes, ';', fixed=TRUE))

#clin_genes: separate element if '|' present, and then remove GeneID-after the ':'
clin_genes <- unlist(strsplit(clin_genes, '|', fixed=TRUE))
clin_genes <- gsub(":.*", "", clin_genes) %>% unique()

#Save the genes:
fwrite(as.data.frame(nearest_gene),"input/nearest_genes_raw",row.names=FALSE, col.names=FALSE,quote=F)
write.xlsx(varannot_genes,"/alice-home/3/n/nnp5/PhD/PhD_project/Var_to_Gene/input/var2genes_raw.xlsx",sheetName = "varannot_genes", row.names=FALSE, col.names=FALSE)

Variant annotation can be represented in qualitative terms, such as variant effect predictor category or in quantitative terms with a score including protein function, conservation, epigenetics, integrative function (ref https://www.nature.com/articles/s41588-020-0676-4). When dealing with quantitative measures for the same functional category,these can be summed up using principal components, as implemented for the first time in the STAAR framework (ref https://www.nature.com/articles/s41588-020-0676-4) and adopted in Functional Annotation of Variants Online Resources (FAVOR) as well. Annotation Principal Component (aPC) is the first PC among standardised individual score for a specific functional category. AnnotationPC is transformed in PHRED-scale score, as CADD. CADD authors suggest 15 as a good threshold for functionally relevant variants (https://cadd.gs.washington.edu/info).
FAVOR aggregates different sources and categories of annotation. I decided to focus on:

  1. Nearest gene for each credible set top PIP variant (“Genecode.Comprehensive.Info”);

  2. Functional annotation for all 615 credible set variants looking at either:

    2.1. FANTOM5 Gene Enhancer and/or Gene Promoter (“CAGE.Promoter”, “CAGE.Enhancer”)

    2.2. Integrative scores > 15 (“CADD.phred”,“aPC.Protein.Function”,“aPC.Conservation”,“aPC.Epigenetics.Active”, “aPC.Epigenetics.Repressed”,“aPC.Epigenetics.Transcription”,“aPC.Local.Nucleotide.Diversity”,“aPC.Mutation.Density”, “aPC.Transcription.Factor”, “aPC.Mappability”);

    2.3. Clinical annotation from ClinVar (“Clinical.Significance”, “Gene.Reported”)

Colocalisation with expression quantitative trait loci (QTL) analysis

Code
src/coloc/000_preprocess_cs.R

#!/usr/bin/env Rscript

#Rationale: Create separate credible sets file for each genomic locus.

args     = commandArgs(trailingOnly = TRUE)
cred_set = args[1]
tmp_path = args[2]

library(tidyverse)
library(data.table)

cs <- fread(cred_set)
#filter out the locus on the MHC region
pheno_tmp <- cs %>% filter(locus != "6_rs9271365_32086794_33086794")
pheno <- unique(pheno_tmp$locus)

#split credset df into separate file for each locus:
##NB: allele2 is the effect allele, I change them because in /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat a1 is the effect allele, a2 is the non effect allele

for (i in pheno){
cs_tmp <- cs %>% filter(locus == as.character(i))
chr <- cs_tmp %>% filter(PIP_average == max(cs_tmp$PIP_average))  %>% select(chromosome)
location <- cs_tmp %>% filter(PIP_average == max(cs_tmp$PIP_average)) %>% select(position)
allele1 <- cs_tmp %>% filter(PIP_average == max(cs_tmp$PIP_average))  %>% select(allele2)
allele2 <- cs_tmp %>% filter(PIP_average == max(cs_tmp$PIP_average))  %>% select(allele1)
fwrite(cs_tmp, paste0(tmp_path,"SA_",chr,"_",location,"_",allele1, "_",allele2), quote=F, sep="\t")}

src/coloc/000A_submit_eqtl_gtex_extraction.sh

#!/bin/bash

#==================================================================
# File:        000A_submit_eqtl_gtex_extraction.sh
# Project:     SevereAsthma
# Author:      NNP
# Date:        31 October 2023
# Rationale:   Submit .
# Submission:  sbatch --array=1-22 src/coloc/000A_submit_eqtl_gtex_extraction.sh
#==================================================================

#SBATCH --job-name=eqtl_gtex_extraction
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=3:0:0
#SBATCH --mem=100gb
#SBATCH --account=gen1
#SBATCH --export=NONE
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk
#SBATCH --array=1-22           # array ranks to run


i=$SLURM_ARRAY_TASK_ID
module load R
Rscript src/coloc/000A_eqtl_gtex_extraction.R ${i}

src/coloc/000A_eqtl_gtex_extraction.R

#!/usr/bin/env Rscript

#==================================================================
# File:        000A_eqtl_gtex_extraction.R
# Project:     SevereAsthma
# Author:      NNP - edited from KC
# Date:        31 October 2023
# Rationale:   Load eQTL data for relevant tissues and and extract
#              chromosome and positions for liftOver.
#==================================================================

suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
#Sys.setenv(LIBARROW_BINARY = TRUE); install.packages('arrow', type = "source")
suppressMessages(library(arrow))

options(scipen=999)

args <- commandArgs(T)

chr <- args[1]

setwd("/data/gen1/reference/GTEx_Analysis_v8_QTLs/GTEx_Analysis_v8_EUR_eQTL_all_associations")

# Identify files ----
file.names = list.files(path = ".", pattern = paste0("*.v8.EUR.allpairs.chr", chr, ".parquet")) %>%
  #Kayesha:
  #str_subset(pattern = "Adrenal|Artery|Blood|Brain|Esophagus|Heart|Ileum|Liver|Lung|Muscle|Nerve|Pituitary|Spleen|Stomach")
  str_subset(pattern = "Skin|Colon")

for (file in file.names) {

  # Read in data ----
  name = str_remove(file, ".parquet")
  cat(paste0("Tissue: ", name, "\n"))
 
  cat("Loading data...\n")
  df <- read_parquet(file)

  # Separate variant_id column into chromosome, position, reference allele and alternative allele ----
  cat("Wrangling data...\n")
  sep <- df %>%
    separate(variant_id, c("CHROM", "POS", "REF", "ALT")) %>%
    mutate(CHROM = str_replace(CHROM, "chr", ""),
           CHROM = str_replace(CHROM, "X", "23"),
           CHROM = as.numeric(CHROM),
           POS = as.numeric(POS),
           ID = paste0(CHROM, "_", POS, "_", pmin(REF, ALT), "_", pmax(REF, ALT)))

    ## Write file ----
    cat(paste0("Writing file: ", name, ".hg38.txt.gz", "\n"))
    fwrite(sep, file = paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/liftover_gtexv8/", name, ".hg38.txt.gz"), quote = F, sep = "\t", na = "NA", row.names = F, col.names = T, compress = "auto")


  # Generate BED file for liftOver ----
  cat(paste0("Creating bed file: ", name, ".hg38.bed", "\n"))
  sep %>%
    select(chrom = CHROM, chromStart = POS, ID) %>%
    mutate(chromEnd = as.integer(chromStart+1),
           chrom = paste0("chr", chrom),
           chrom = str_replace(chrom, "chr23", "chrX")) %>%
    select(chrom, chromStart, chromEnd, ID) %>%
    fwrite(file = paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/liftover_gtexv8/bed/", name, ".hg38.bed"), quote = F, sep = "\t", na = "NA", row.names = F, col.names = F)
  
  cat("Done.\n\n")
  
  }

src/coloc/000B_eqtl_gtex_liftover.sh

#!/bin/bash

#==================================================================
# File:        000B_eqtl_gtex_liftover.sh
# Project:     SevereAsthma
# Author:      NNP- edited from KC
# Date:        31 October 2023
# Rationale:   Liftover eQTL coordinates from GRCh37 to GRCh38.
# Submission:  qsub 02B_eqtl_gtex_liftover.sh
#==================================================================

#SBATCH --job-name=eqtl_gtex_liftover
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=23:0:0
#SBATCH --mem=100gb
#SBATCH --account=gen1
#SBATCH --export=NONE
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk


FILE_PATH="/scratch/gen1/nnp5/Var_to_Gen_tmp/liftover_gtexv8/bed"
liftOver="/home/n/nnp5/software/liftOver"
chain="/data/gen1/reference/liftOver/hg38ToHg19.over.chain"

cd ${FILE_PATH}

for filename in *.hg38.bed
do

  name=$(basename ${filename} .hg38.bed)

  ${liftOver} \
  ${filename} \
  ${chain} \
  ${name}.hg19.bed \
  ${name}.unlifted.bed

done

src/coloc/000C_submit_eqtl_gtex_conversion.sh

#!/bin/bash

#==================================================================
# File:        000A_submit_eqtl_gtex_conversion.sh
# Project:     SevereAsthma
# Author:      NNP
# Date:        31 October 2023
# Rationale:   Submit .
# Submission:  sbatch --array=1-22 src/coloc/000C_submit_eqtl_gtex_conversion.sh
#==================================================================

#SBATCH --job-name=eqtl_gtex_extraction
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=3:0:0
#SBATCH --mem=100gb
#SBATCH --account=gen1
#SBATCH --export=NONE
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk
#SBATCH --array=1-22           # array ranks to run


i=$SLURM_ARRAY_TASK_ID
module load R
Rscript src/coloc/000C_eqtl_gtex_conversion.R ${i}

src/coloc/000C_eqtl_gtex_conversion.R

#!/usr/bin/env Rscript

#==================================================================
# File:        000C_eqtl_gtex_conversion.R
# Project:     SevereAsthma
# Author:      NNP-edited from KC
# Date:        31 October 2023
# Ratinale:    Update coordinates in original GTeX files to GRCh37.
#==================================================================

suppressMessages(library(tidyverse))
suppressMessages(library(data.table))

options(scipen=999)

args <- commandArgs(T)

chr <- args[1]

setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/liftover_gtexv8")

# Identify files ----
file.names = list.files(path = ".", pattern = paste0("*.v8.EUR.allpairs.chr", chr, ".hg38.txt.gz")) %>%
  #Kayesha
  #str_subset(pattern = "Adrenal|Artery|Blood|Brain|Esophagus|Heart|Ileum|Liver|Lung|Muscle|Nerve|Pituitary|Spleen|Stomach")
  str_subset(pattern = "Colon|Skin")

for (file in file.names) {


  # Read in data ----
  name = str_remove(file, ".hg38.txt.gz")
  cat(paste0("Tissue: ", name, "\n"))

  ## GTeX
  hg38 <- as_tibble(fread(file))
  ## Liftover file (hg19)
  liftover <- as_tibble(fread(paste0("bed/", name, ".hg19.bed"), header = FALSE))


  # Remove chromosomes from liftover bed (hg19) which are alternative contigs, and filter for only chromosome of GTeX data ----
  ## As multiple duplicate variants are present, remove these
  cat("Wrangling liftOver data...\n")
  if (chr == "X") {

    liftover <- liftover %>%
      select(CHROM = V1, GENPOS = V2, ID = V4) %>%
      mutate(CHROM = str_replace(CHROM, "chr", ""),
             CHROM = str_replace(CHROM, "X", "23")) %>%
      filter(CHROM == 23) %>%
      mutate(CHROM = as.integer(CHROM)) %>%
      distinct(ID, .keep_all = TRUE)
    #arrange(CHROM, GENPOS)

  } else {
  
    liftover <- liftover %>%
      select(CHROM = V1, GENPOS = V2, ID = V4) %>%
      mutate(CHROM = str_replace(CHROM, "chr", ""),
             CHROM = str_replace(CHROM, "X", "23")) %>%
      filter(CHROM == chr) %>%
      mutate(CHROM = as.integer(CHROM)) %>%
      distinct(ID, .keep_all = TRUE)
    }
  

  # Perform mapping ----
  cat("Performing mapping...\n")
  hg19 <- hg38 %>%
    right_join(liftover, by = "ID") %>%
    select(-CHROM.x, -POS) %>%
    rename(CHROM = CHROM.y, POS = GENPOS) %>%
    mutate(ID = paste0(CHROM, "_", POS, "_", pmin(REF, ALT), "_", pmax(REF, ALT))) %>%
    relocate(c(CHROM, POS, ID), .after = phenotype_id) %>%
    select(-tss_distance)        # Will no longer apply to new coordinates


  # Write file ----
  cat(paste0("Writing file: ", name, ".hg19.txt.gz", "\n"))
  fwrite(hg19, paste0(name, ".hg19.txt.gz"), quote = F, sep = "\t", na = "NA", row.names = F, col.names = T, compress = "auto")
  cat("Done.\n\n")
  }

src/coloc/001_submit_GWASpairs.sh

#!/bin/bash

#SBATCH --job-name=SA_GWASpairs
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=0:30:0
#SBATCH --mem=16gb
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk
#SBATCH --account=gen1
#SBATCH --export=NONE

module load R

Rscript src/coloc/001_run_GWASpairs.R $CREDSET

src/coloc/001_run_GWASpairs.R

library(data.table)
library(tidyverse)

args     = commandArgs(trailingOnly = TRUE)
cred_set = args[1]

tmp      = unlist(strsplit(cred_set, split="_"))
chr      = as.numeric(tmp[2])
location = as.numeric(tmp[3])

tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"

gwas <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat") %>%
  filter(b37chr==chr, between(bp, location-5e5, location+5e5))

write_delim(x=gwas, file=paste0(tmp_path, cred_set, "_GWASpairs.txt.gz"))

src/coloc/002_prepare_LDinput.R

#!/usr/bin/env Rscript

#Rationale: combine names in a document to create LD .raw file

args     = commandArgs(trailingOnly = TRUE)
tissue = args[1]

#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"

library(tidyverse)


df <- data.frame(cred_set = c("SA_8_81292599_C_A","SA_6_90963614_AT_A","SA_5_110401872_C_T",
    "SA_2_242692858_T_C","SA_15_67442596_T_C","SA_12_56435504_C_G","SA_11_76296671_G_A",
    "SA_9_6209697_G_A","SA_5_131885240_C_G","SA_3_33042712_C_T",
    "SA_2_102913642_AAAAC_A","SA_17_38168828_G_A","SA_16_27359021_C_A","SA_15_61068954_C_T",
    "SA_12_48202941_T_C","SA_10_9064716_C_T")) %>%
  as_tibble %>%
  separate(cred_set, c("pheno", "chr", "pos", "a1", "a2"), sep="_")

df$ancestry <- "EUR"

df <- df %>%
  unite("snp", chr:a2, remove=FALSE) %>%
  mutate(eQTL  = tissue,
         start = as.numeric(pos) - 5e5,
         end   = as.numeric(pos) + 5e5) %>%
  select(pheno, ancestry, snp, chr, pos, eQTL, start, end)

write_tsv(x=df, file=paste0(tmp_path,"gtex_Pairs_lookup.txt"),col_names=F, append=T)

src/coloc/002_get_LD.sh

#!/bin/bash

#SBATCH --job-name=ld
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=1:0:0
#SBATCH --mem=24gb
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk
#SBATCH --account=gen1
#SBATCH --export=NONE

module load plink/1.9-beta6.27-vhw5dr2
module load R

#For GTExv8:
#pairs_lookup_file="/scratch/gen1/nnp5/Var_to_Gen_tmp/gtex_Pairs_lookup.txt"
#DIR="/scratch/gen1/nnp5/Var_to_Gen_tmp"

#For eqtlGen:
#pairs_lookup_file="/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen/eqtlGenWB_Pairs_lookup.txt"
#DIR="/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen"

#For UBCLung:
pairs_lookup_file="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung/UBCLung_Pairs_lookup.txt"
DIR="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung"

cd ${DIR}

while IFS='' read -r LINE || [ -n "${LINE}" ]
do

chr=`echo ${LINE} | awk '{print $4}'`
pheno=`echo ${LINE} | awk '{print $1}'`
signal=`echo ${LINE} | awk '{print $3}'`
eQTL=`echo ${LINE} | awk '{print $6}'`
start=`echo ${LINE} | awk '{print $7}'`
end=`echo ${LINE} | awk '{print $8}'`
anc=`echo ${LINE} | awk '{print $2}'`


name="${DIR}/${signal}_${pheno}_${eQTL}"

plink_script="${name}_plink.sh"
 if [ -f "$plink_script" ]; then
     echo "PLINK script exists, now deleted and created a new one"
       rm $plink_script
 fi

 if [ ${anc}=="EUR" ]; then
    ref_ld="/data/gen1/LF_HRC_transethnic/signal_selection/LDpanel/EUR_UKB/ukb_imp_chr${chr}_EUR_selected"
 else
    ref_ld="/data/gen1/LF_HRC_transethnic/signal_selection/LDpanel/AFR_1000G/ALL.chr${chr}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes_Amin_Amax_noesv_AFR"
 fi

# Create backslash string
bs="\\"

#create regions to extract
region_file="${name}_region.txt"
echo -e ${chr}'\t'${start}'\t'${end} > ${region_file}

# Create script file
cat <<EOT>> ${plink_script}

/home/n/nnp5/software/plink2 $bs
--bfile $ref_ld $bs
--extract range ${region_file} $bs
--make-bed $bs
--out ${name}
EOT

sh ${plink_script}

# Now recode
plink_scriptR="${name}_plink_recode.sh"
if [ -f "$plink_script" ]; then
    echo "PLINK script exists, now deleted and created a new one"
       rm $plink_scriptR
fi

# Create script file
cat <<EOT >> ${plink_scriptR}

/home/n/nnp5/software/plink2 $bs
--bfile ${name} $bs
--out ${name} $bs
--recode A

EOT

sh ${plink_scriptR}

done < ${pairs_lookup_file}

src/coloc/003_submit_coloc_susie_GTEx.sh

#!/bin/bash

#SBATCH --job-name=coloc_susie_gtex
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=0:30:0
#SBATCH --mem=32gb
#SBATCH --account=gen1
#SBATCH --export=NONE

tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"


i=$SLURM_ARRAY_TASK_ID
read GENE < <(awk -F'\t' 'NR == '$((i))' { print $1 }' ${tmp_path}${CREDSET}_${TISSUE}_genes.txt)

module load R

Rscript ./src/coloc/003_run_coloc_susie_GTEx.R $TISSUE $CREDSET $GENE





#for c in ${!cs[*]}; do

#  N=`cat ${tmp_path}${cs[c]}_${tissue[t]}_genes.txt | wc -l`

#  sbatch --array=1-${N}%20 --export=TISSUE="${tissue}",CREDSET="${cs[c]}" ${tmp_path}003_submit_coloc_susie_GTEx.sh

# sleep 5

#done

src/coloc/003_run_coloc_susie_GTEx.R

#!/usr/bin/env Rscript

#Rationale: Run coloc and coloc.susie in GTExV8 tissues

suppressMessages(library(tidyverse))
library(coloc)
suppressMessages(library(Rfast))

args     = commandArgs(trailingOnly = TRUE)
tissue   = args[1]
cred_set = args[2]
gene     = args[3]

tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"

cred_set %>%
  as_tibble %>%
  separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
  select(pheno) %>%
  pull -> pheno

cred_set %>% 
  as_tibble %>% 
  separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
  unite("signal", chr:a2) %>% 
  pull -> signal

############################
## Read asthma GWAS sumstat, eQTL GWAS, and LD matrix
############################
GWAS = read_delim(paste0(tmp_path, cred_set, "_GWASpairs.txt.gz")) %>%
  rename(chr = b37chr, pos = bp, beta = LOG_ODDS, SE = se) %>%
  select(-snpid) %>%
  arrange(chr, pos) %>%
  drop_na(beta) %>%
  mutate(varbeta = SE^2) %>%
  rename(position=pos)
GWAS$allele1.gwas <- GWAS$a1
GWAS <- GWAS %>% mutate(snp = paste0(chr, "_", position, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
  select(c(snp,beta,SE,eaf,pval,MAF,varbeta,allele1.gwas)) %>%
  distinct(snp, .keep_all=TRUE)

GWAS$N <- as.numeric(38405)

eqtlGWAS = read_delim(paste0(tmp_path, cred_set, "_", tissue, "_pairs.txt.gz")) %>%
  mutate(ID      = gsub(x=ID, pattern=":", replacement="_"),
         varbeta = se^2) %>%
  arrange(chrom, pos)  %>%
  drop_na(beta) %>%
  filter(gene_id==gene, 
         ID %in% GWAS$snp) %>%
  rename(snp=ID, position=pos, MAF=maf)

GWAS %>% filter(snp %in% eqtlGWAS$snp) -> GWAS


############################
#Do colocalisation ONLY IF GWAS and eqtlGWAS (eQTL tissue-gene) contains pvalue <= 5x10-6.
############################
GWAS_sign <- GWAS %>% filter(pval <= 0.000005)
eqtlGWAS_sign <- eqtlGWAS %>% filter(pval <= 0.000005)
if (dim(eqtlGWAS_sign)[1] < 1){
    stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No colocalisation is possible because eQTL data has pvalue >= 5x10-6."))
}
if (dim(GWAS_sign)[1] < 1){
    stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No colocalisation is possible because GWAS data has pvalue >= 5x10-6."))
} else {
    paste0(signal," ",tissue, " ", gene, " GTExv8: Starting colocalisation because both GWAS and eQTL data has pvalue <= 5x10-6.")
}

############################
# * if also want coloc results
############################
coloc_all = coloc.abf(dataset1=list(beta=GWAS$beta, varbeta=GWAS$varbeta,
                                   N=GWAS$N, type="cc", MAF=GWAS$MAF, snp=GWAS$snp),
                     dataset2=list(beta=eqtlGWAS$beta, varbeta=eqtlGWAS$varbeta, 
                                   N=eqtlGWAS$N, type="quant", MAF=eqtlGWAS$MAF, snp=eqtlGWAS$snp))

saveRDS(coloc_all, paste0(tmp_path,"results/gtex/",cred_set, "_", tissue, "_", gene, "_all_coloc.rds"))

#look at the results:
coloc <- readRDS(paste0(tmp_path,"results/gtex/", cred_set, "_", tissue, "_", gene, "_all_coloc.rds"))
#sensitivity plot:
sensitivity(coloc,"H4 > 0.8")
#


############################
##COLOC.SUSIE to allow presence of multiple causal variants
############################
# 1) Format LD matrix
############################
ld = data.table::fread(
    paste0(tmp_path, signal, "_", pheno, "_", tissue, ".raw")
  ) %>% 
  select(!c(FID:PHENOTYPE))

ld %>% 
  names %>% 
  as_tibble %>% 
  separate(value,c("c", "p", "a1", "a2")) %>% 
  mutate(snp = paste0(c, "_", p, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
  select(snp) %>%
  pull -> snps

ld = t(ld) %>%
  as_tibble %>%
  mutate(var=apply(., 1, var, na.rm=TRUE),
         snp=snps) %>%
  filter(var!=0) %>%
  filter(snp %in% GWAS$snp) %>%
  filter(snp %in% eqtlGWAS$snp)

N = ncol(ld)-2 
X = t(as.matrix(ld[,1:N]))
colnames(X) = ld %>% 
  select(snp) %>% 
  pull # Sdd names to facilitate filtering Nas (still some NAs even with pairwise.complete.obs)
LDmatrix = cor(X, use="pairwise.complete.obs") # Pearson's correlation, takes some time...
LDmatrix[is.na(LDmatrix)] <- 0 

############################
# 2) Format asthma GWAS
############################
bim = read_tsv(
    paste0(tmp_path, signal, "_", pheno, "_", tissue, ".bim"),
    col_names=c("chr", "snp", "morgans", "position", "allele1", "allele2")
  ) %>% 
  select(!morgans) # Read bim file to check same set of SNPs and allele alignment

GWAS_df <- GWAS %>%
  filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs
  inner_join(bim) %>%
  mutate(beta=ifelse(allele1!=allele1.gwas, -(beta), beta)) # Check allele alignment

#Check GWAS has at least one variant with pval <= 0.000005:
GWAS_sign <- GWAS_df %>% filter(pval <= 0.000005)
if (dim(GWAS_sign)[1] < 1){
    stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No coloc.susie is possible because GWAS data has pvalue >= 5x10-6."))
}

GWAS_df %>%
  as.list -> GWAS

GWAS$type = "cc"
N = as.integer(mean(GWAS$N))
GWAS$N = N
GWAS$LD = LDmatrix

############################
# 3) Format eQTL GWAS  
############################
eqtlGWAS_df <- eqtlGWAS  %>%
  filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs 
  inner_join(bim) %>%
  mutate(beta=ifelse(allele1!=ref, -(beta), beta)) # Check allele alignment

#Check eqtlGWAS has at least one variant with pval <= 0.000005:
eqtlGWAS_sign <- eqtlGWAS_df %>% filter(pval <= 0.000005)
if (dim(eqtlGWAS_sign)[1] < 1){
    stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No coloc.susie is possible because eQTL-GWAS data has pvalue >= 5x10-6."))
} else {
    paste0(signal," ",tissue, " ", gene, " GTExv8: Starting coloc.susie because both GWAS and eQTL data has pvalue <= 5x10-6.")
}

eqtlGWAS <- eqtlGWAS_df %>% as.list

eqtlGWAS$type = "quant"
eqtlGWAS$LD = LDmatrix
N = as.integer(mean(eqtlGWAS$N))
eqtlGWAS$N = N


############################
# Check datasets are ok
############################
check_dataset(eqtlGWAS, req="LD") -> eqtlGWAS_check
check_dataset(GWAS, req="LD") -> GWAS_check

if (is.null(GWAS_check) & is.null(eqtlGWAS_check)){
  cat(paste0(cred_set, "_", tissue, "_", gene, ": ok", "\n"), 
      file=paste0(tmp_path,"GWAS_coloc_susie_check.txt"), append=TRUE)
} else {
  cat(paste0(cred_set, "_", tissue, "_", gene, ": warning", "\n"), 
      file=paste0(tmp_path,"GWAS_coloc_susie_check.txt"), append=TRUE)
}


############################
## Run SuSie
############################
susie_GWAS = runsusie(GWAS, r2.prune=0.2, check_R=FALSE)
susie_eQTLGWAS = runsusie(eqtlGWAS, r2.prune=0.2, check_R=FALSE)
susie_all = coloc.susie(susie_GWAS, susie_eQTLGWAS)

saveRDS(susie_all, paste0(tmp_path,"results/gtex/", cred_set, "_", tissue, "_", gene, "_all_susie.rds"))
write_tsv(susie_all$summary %>% as_tibble,
          paste0(tmp_path, "results/gtex/", cred_set, "_", tissue, "_", gene, "_susie_summary.tsv"))

src/coloc/000_submit_edit_eQTLGen.sh

#!/bin/bash

#SBATCH --job-name=edit_eQTLGen
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=8:0:0
#SBATCH --mem=80gb
#SBATCH --account=gen1
#SBATCH --export=NONE

module load R

Rscript src/coloc/000_run_edit_eQTLGen.R

src/coloc/000_run_edit_eQTLGen.R

#!/usr/bin/env Rscript

#==================================================================
# File:        000_run_edit_eQTL.R
# Project:     SevereAsthma
# Author:      NNP - edited from AW
# Date:        1 November 2023
# Rationale:   Load eqtlGen data to add allele frequency
#==================================================================

library(tidyverse)

tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen"

freq = read_tsv("/data/gen1/LF_HRC_transethnic/eQTL/eQTLgen/2018-07-18_SNP_AF_for_AlleleB_combined_allele_counts_and_MAF_pos_added.txt.gz", 
                col_select=c("hg19_chr", "hg19_pos", "AlleleA", "AlleleB", "AlleleB_all")) %>%
  mutate(ID = paste0(hg19_chr, ":", hg19_pos, "_", pmin(AlleleA, AlleleB), "_", pmax(AlleleA, AlleleB))) %>%
  select(ID, AssessedAllele_freq = AlleleB_all)

eqtl = read_tsv("/data/gen1/reference/eqtlgen/2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz") %>%
  mutate(ID = paste0(SNPChr, ":", SNPPos, "_", pmin(AssessedAllele, OtherAllele), "_", pmax(AssessedAllele, OtherAllele))) %>%
  relocate(ID, .before="SNP") %>%
  left_join(freq) %>%
  mutate(beta = Zscore / sqrt(2 * AssessedAllele_freq * (1 - AssessedAllele_freq) * (NrSamples + Zscore^2)), 
         se   = 1 / sqrt(2 * AssessedAllele_freq * (1 - AssessedAllele_freq) * (NrSamples + Zscore^2)))

write_tsv(x=eqtl, file=paste0(tmp_path,"/whole_blood_cis_eqtls_withAF.txt.gz"))

src/coloc/001_submit_eqtl_lookup_GTEx.sh

#!/bin/bash

#SBATCH --job-name=SA_eqtl_lookup
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=0:30:0
#SBATCH --mem=16gb
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=nnp5@le.ac.uk
#SBATCH --account=gen1
#SBATCH --export=NONE

#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"

module load R
Rscript ./src/coloc/001_run_eqtl_lookup_GTEx.R $TISSUE $CREDSET

src/coloc/001_run_eqtl_lookup_GTEx.R

#!/usr/bin/env Rscript

#Rationale: Create GTExv8 eQTL files to use for colocalisation, filter genes for each eQTL files, find GWAS sumstat lookup
#in GTEXv8 for regions that I cannot do colocalisation, e.i. HLA regions.

library(data.table)
library(tidyverse)

args     = commandArgs(trailingOnly = TRUE)
tissue   = args[1]
cred_set = args[2]

tmp      = unlist(strsplit(cred_set, split="_"))
chr      = as.numeric(tmp[2])
location = as.numeric(tmp[3])

tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"

##allele are place in alphabetical order ! (no dependency on effect allele, reference/alternative)

#eqtl:
#'Colon_Transverse' 'Colon_Sigmoid' 'Skin_Sun_Exposed_Lower_leg' 'Skin_Not_Sun_Exposed_Suprapubic' files are in my scratch;
#make an exeption for these files:
my_tissues <- c("Colon_Transverse", "Colon_Sigmoid", "Skin_Sun_Exposed_Lower_leg", "Skin_Not_Sun_Exposed_Suprapubic")
if (tissue %in% my_tissues ){
    eqtl = fread(paste0(tmp_path,"liftover_gtexv8/", tissue, ".v8.EUR.allpairs.chr", chr, ".hg19.txt.gz"),
             select=c("phenotype_id", "CHROM", "POS", "REF", "ALT", "maf", "ma_samples", "slope", "slope_se", "pval_nominal"),
             col.names=c("gene_id", "chrom", "pos", "ref", "alt", "maf", "N", "beta", "se", "pval"),
             showProgress=FALSE) %>%
        mutate(ID = paste0(chrom, ":", pos, "_", pmin(ref, alt), "_", pmax(ref, alt))) %>%
        relocate(ID, .before="gene_id") %>%
        filter(between(pos, location-5e5, location+5e5), chrom==chr)
    } else {
eqtl = fread(paste0("/data/gen1/ACEI/colocalisation_datasets/eQTL/GTeX/", tissue, ".v8.EUR.allpairs.chr", chr, ".hg19.txt.gz"), 
             select=c("phenotype_id", "CHROM", "POS", "REF", "ALT", "maf", "ma_samples", "slope", "slope_se", "pval_nominal"), 
             col.names=c("gene_id", "chrom", "pos", "ref", "alt", "maf", "N", "beta", "se", "pval"), 
             showProgress=FALSE) %>%
  mutate(ID = paste0(chrom, ":", pos, "_", pmin(ref, alt), "_", pmax(ref, alt))) %>%
  relocate(ID, .before="gene_id") %>%
  filter(between(pos, location-5e5, location+5e5), chrom==chr)
}

genes = eqtl %>% distinct(gene_id)

write_delim(x=eqtl, file=paste0(tmp_path, cred_set, "_", tissue, "_pairs.txt.gz"))
write_delim(x=genes, file=paste0(tmp_path, cred_set, "_", tissue, "_genes.txt"), col_names=FALSE)

#import gwas sumstat:
gwas_sumstat <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat")
gwas_sumstat <- gwas_sumstat %>% setnames(c("b37chr","bp","a1","a2"),c("chrom","position","allele1","allele2"))

#credible set:
cs <- fread(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/", cred_set))
cs <- cs %>% setnames("chromosome","chrom")

#merge credible set with gwas sumstat:
cs_gwas_sumstat <- inner_join(cs,gwas_sumstat, by=c("chrom","position","allele1","allele2","snpid"))

#merge credible set-gwas sumstat with eqtl:
cs_gwas_sumstat_eqtl <- cs_gwas_sumstat %>% setnames(c("position","LOG_ODDS"),c("pos","beta")) %>%
  mutate(ID = paste0(chrom, ":", pos, "_", pmin(allele1, allele2), "_", pmax(allele1, allele2))) %>%
  relocate(ID, .before="chrom") %>%
  distinct() %>%
  left_join(eqtl, by="ID", suffix=c(".gwas", ".eqtl")) %>%
  mutate(beta.eqtl = if_else(alt == allele1, beta.eqtl, -beta.eqtl))

write_delim(x=cs_gwas_sumstat_eqtl, file=paste0(tmp_path, cred_set, "_", tissue, "_lookup.txt.gz"))

src/coloc/002_prepare_LDinput_eqtlgen.R

#!/usr/bin/env Rscript

#Rationale: combine names in a document to create LD .raw file for eQTLGen

library(tidyverse)

args     = commandArgs(trailingOnly = TRUE)
tissue = args[1]

#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen/"

df <- data.frame(cred_set = c("SA_8_81292599_C_A","SA_6_90963614_AT_A","SA_5_110401872_C_T",
    "SA_2_242692858_T_C","SA_15_67442596_T_C","SA_12_56435504_C_G","SA_11_76296671_G_A",
    "SA_9_6209697_G_A","SA_5_131885240_C_G","SA_3_33042712_C_T",
    "SA_2_102913642_AAAAC_A","SA_17_38168828_G_A","SA_16_27359021_C_A","SA_15_61068954_C_T",
    "SA_12_48202941_T_C","SA_10_9064716_C_T")) %>%
  as_tibble %>%
  separate(cred_set, c("pheno", "chr", "pos", "a1", "a2"), sep="_")

df$ancestry <- "EUR"

df <- df %>%
  unite("snp", chr:a2, remove=FALSE) %>%
  mutate(eQTL  = tissue,
         start = as.numeric(pos) - 5e5,
         end   = as.numeric(pos) + 5e5) %>%
  select(pheno, ancestry, snp, chr, pos, eQTL, start, end)

write_tsv(x=df, file=paste0(tmp_path,tissue,"_Pairs_lookup.txt"),col_names=F, append=T)

src/coloc/003_submit_coloc_susie_eQTLGen.sh

#!/bin/bash

#SBATCH --job-name=coloc_susie_eqtlgen
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=0:30:0
#SBATCH --mem=32gb
#SBATCH --account=gen1
#SBATCH --export=NONE

tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen/"

i=$SLURM_ARRAY_TASK_ID
read GENE < <(awk -F'\t' 'NR == '$((i))' { print $1 }' ${tmp_path}${CREDSET}_eqtlGenWB_genes.txt)

module load R

Rscript src/coloc_UBClung/003_run_coloc_susie_lung_eQTL.r "eqtlGenWB" $CREDSET $GENE

src/coloc/003_run_coloc_susie_eQTLGen.R

#!/usr/bin/env Rscript

#Rationale: Run coloc and coloc.susie in eQTLGen wholeblood tissue

suppressMessages(library(tidyverse))
library(coloc)
suppressMessages(library(Rfast))

args     = commandArgs(trailingOnly = TRUE)
tissue   = args[1]
cred_set = args[2]
gene     = args[3]

tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/eqtlgen/"

cred_set %>%
  as_tibble %>%
  separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
  select(pheno) %>%
  pull -> pheno

cred_set %>%
  as_tibble %>%
  separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
  unite("signal", chr:a2) %>%
  pull -> signal

############################
## Read asthma GWAS sumstat, eQTL GWAS, and LD matrix
############################
GWAS = read_delim(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/", cred_set, "_GWASpairs.txt.gz")) %>%
  rename(chr = b37chr, pos = bp, beta = LOG_ODDS, SE = se) %>%
  select(-snpid) %>%
  arrange(chr, pos) %>%
  drop_na(beta) %>%
  mutate(varbeta = SE^2) %>%
  rename(position=pos)
GWAS$allele1.gwas <- GWAS$a1
GWAS <- GWAS %>% mutate(snp = paste0(chr, "_", position, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
  select(c(snp,beta,SE,eaf,pval,MAF,varbeta,allele1.gwas)) %>%
  distinct(snp, .keep_all=TRUE)

GWAS$N <- as.numeric(38405)

eqtlGWAS = read_delim(paste0(tmp_path, cred_set, "_eqtlGenWB_pairs.txt.gz")) %>%
  mutate(ID      = gsub(x=ID, pattern=":", replacement="_"), 
         varbeta = se^2, 
         MAF     = if_else(AssessedAllele_freq <0.5, AssessedAllele_freq, 1-AssessedAllele_freq)) %>% 
  arrange(chrom, pos)  %>%
  drop_na(beta) %>%
  filter(GeneSymbol==gene, 
         ID %in% GWAS$snp) %>%
  rename(snp=ID, position=pos, N=NrSamples)

GWAS %>% filter(snp %in% eqtlGWAS$snp) -> GWAS


############################
#Do colocalisation ONLY IF GWAS and eqtlGWAS (eQTL tissue-gene) contains pvalue <= 5x10-6.
############################
GWAS_sign <- GWAS %>% filter(pval <= 0.000005)
eqtlGWAS_sign <- eqtlGWAS %>% filter(pval <= 0.000005)
if (dim(eqtlGWAS_sign)[1] < 1){
    stop(paste0(signal," ",tissue, " ", gene, ": No colocalisation is possible because eQTL data has pvalue >= 5x10-6."))
}
if (dim(GWAS_sign)[1] < 1){
    stop(paste0(signal," ",tissue, " ", gene, ": No colocalisation is possible because GWAS data has pvalue >= 5x10-6."))
} else {
    paste0(signal," ",tissue, " ", gene, ": Starting colocalisation because both GWAS and eQTL data has pvalue <= 5x10-6.")
}

############################
# * if also want coloc results
############################
coloc_all = coloc.abf(dataset1=list(beta=GWAS$beta, varbeta=GWAS$varbeta, 
                                    N=GWAS$N, type="cc", MAF=GWAS$MAF, snp=GWAS$snp), 
                      dataset2=list(beta=eqtlGWAS$beta, varbeta=eqtlGWAS$varbeta, 
                                    N=eqtlGWAS$N, type="quant", MAF=eqtlGWAS$MAF, snp=eqtlGWAS$snp))

saveRDS(coloc_all, paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/eqtlgen/", cred_set, "_eqtlGenWB_", gene, "_all_coloc.rds"))

############################
# 1) Format LD matrix
############################
ld = data.table::fread(
  paste0(tmp_path, signal, "_", pheno, "_", tissue, ".raw")
) %>% 
  select(!c(FID:PHENOTYPE))

ld %>% 
  names %>% 
  as_tibble %>% 
  separate(value,c("c", "p", "a1", "a2")) %>% 
  mutate(snp = paste0(c, "_", p, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
  select(snp) %>%
  pull -> snps

ld = t(ld) %>%
  as_tibble %>%
  mutate(var=apply(., 1, var, na.rm=TRUE),
         snp=snps) %>%
  filter(var!=0) %>%
  filter(snp %in% GWAS$snp) %>%
  filter(snp %in% eqtlGWAS$snp)

N = ncol(ld)-2
X = t(as.matrix(ld[,1:N]))
colnames(X) = ld %>%
  select(snp) %>%
  pull # Sdd names to facilitate filtering Nas (still some NAs even with pairwise.complete.obs)
LDmatrix = cor(X, use="pairwise.complete.obs") # Pearson's correlation, takes some time...
LDmatrix[is.na(LDmatrix)] <- 0

############################
# 2) Format asthma GWAS
############################
bim = read_tsv(
    paste0(tmp_path, signal, "_", pheno, "_", tissue, ".bim"),
    col_names=c("chr", "snp", "morgans", "position", "allele1", "allele2")
  ) %>%
  select(!morgans) # Read bim file to check same set of SNPs and allele alignment

GWAS_df <- GWAS %>%
  filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs
  inner_join(bim) %>%
  mutate(beta=ifelse(allele1!=allele1.gwas, -(beta), beta)) # Check allele alignment

#Check GWAS has at least one variant with pval <= 0.000005:
GWAS_sign <- GWAS_df %>% filter(pval <= 0.000005)
if (dim(GWAS_sign)[1] < 1){
    stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No coloc.susie is possible because GWAS data has pvalue >= 5x10-6."))
}

GWAS_df %>%
  as.list -> GWAS

GWAS$type = "cc"
N = as.integer(mean(GWAS$N))
GWAS$N = N
GWAS$LD = LDmatrix

############################
# 3) Format eQTL GWAS  
############################
eqtlGWAS_df <- eqtlGWAS  %>%
  filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs
  inner_join(bim) %>%
  mutate(beta=ifelse(allele1!=AssessedAllele, -(beta), beta)) # Check allele alignment

#Check eqtlGWAS has at least one variant with pval <= 0.000005:
eqtlGWAS_sign <- eqtlGWAS_df %>% filter(pval <= 0.000005)
if (dim(eqtlGWAS_sign)[1] < 1){
    stop(paste0(signal," ",tissue, " ", gene, " GTExv8: No coloc.susie is possible because eQTL-GWAS data has pvalue >= 5x10-6."))
} else {
    paste0(signal," ",tissue, " ", gene, " GTExv8: Starting coloc.susie because both GWAS and eQTL data has pvalue <= 5x10-6.")
}

eqtlGWAS <- eqtlGWAS_df %>% as.list

eqtlGWAS$type = "quant"
eqtlGWAS$LD = LDmatrix
N = as.integer(mean(eqtlGWAS$N))
eqtlGWAS$N = N

############################
# Check datasets are ok
############################
check_dataset(eqtlGWAS, req="LD") -> eqtlGWAS_check
check_dataset(GWAS, req="LD") -> GWAS_check

if (is.null(GWAS_check) & is.null(eqtlGWAS_check)){
  cat(paste0(cred_set, "_", tissue, "_", gene, ": ok", "\n"), 
      file=paste0(tmp_path,"GWAS_coloc_susie_check.txt"), append=TRUE)
} else {
  cat(paste0(cred_set, "_", tissue, "_", gene, ": warning", "\n"), 
      file=paste0(tmp_path,"GWAS_coloc_susie_check.txt"), append=TRUE)
}

############################
## Run SuSie
############################
susie_GWAS = runsusie(GWAS, r2.prune=0.2, check_R=FALSE)
susie_eQTLGWAS = runsusie(eqtlGWAS, r2.prune=0.2, check_R=FALSE, verbose=TRUE)
susie_all = coloc.susie(susie_GWAS, susie_eQTLGWAS)

saveRDS(susie_all, paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/eqtlgen/", cred_set, "_", tissue, "_", gene, "_all_susie.rds"))
write_tsv(susie_all$summary %>% as_tibble,
          paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/eqtlgen/", cred_set, "_", tissue, "_", gene, "_susie_summary.tsv"))

src/coloc/004_concat_coloc_results.R

#!/usr/bin/env Rscript

#Rationale: Collate results together and extract significant Variant/Locus-Tissue-Gene colocalisation, from GTExV8 and eQTLGen

library(plyr)
library(tidyverse)
library(purrr)
library(xlsx)

###############
### GTEx v8 ###
###############
print("GTExV8 eQTL coloc results")
# .rds files containing standard coloc results with eQTLs
setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/gtex/")
file_paths_gtex = list.files(pattern="_all_coloc.rds")

# Manipulate file names to get various bits of info
file_paths_gtex %>%
  as_tibble %>%
  mutate(file = gsub(x=value, pattern="_all_coloc.rds", replacement="")) %>%
  separate(col=file, 
           into=c("pheno", "chr", "pos", "a1", "a2", "t1", "t2", "t3", "t4", "t5", "t6"),
           sep="_", 
           remove=FALSE) -> file_split

# Clunky way to get out gene and tissue names from file names
tmp = strsplit(file_split$file, "_")
gene = sapply(tmp, tail, 1)
tissue_fct = function(df) {
  x = paste0(df[7:(length(df)-1)], sep="_", collapse="")
  x = tolower(substr(x, 1, nchar(x)-1))
}
tissue = sapply(tmp, tissue_fct)

# Add gene and tissue to above dataframe
file_split %>%
  mutate(gene   = gene,
         tissue = tissue) %>%
  unite("snp", chr:a2) %>%
  select(file=value, pheno, snp, gene, tissue) -> file_split

# Read all .rds files
data_list = lapply(file_paths_gtex, readRDS)

# Function to extract coloc information from results
summary_fct = function(df) {
  return(df$summary)
}

# Collapse list of coloc results into single dataframe and add a few new columns
data_summary = ldply(lapply(data_list, summary_fct)) %>%
  as_tibble %>%
  bind_cols(file_split) %>%
  select(-file) %>%
  mutate(coloc = if_else(PP.H4.abf >= 0.9, TRUE, FALSE)) %>%
  arrange(desc(PP.H4.abf))

# Check if there are any colocalisations
data_summary %>% filter(coloc) %>% select(snp, pheno, gene)


# Save all results to file
print("GTExV8 eQTL coloc.susie results")
write_tsv(x=data_summary, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/coloc_asthma_GTEx.tsv")

# .rds files containing coloc.susie results with eQTLs
setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/gtex/")
file_paths_gtex = list.files(pattern="_all_susie.rds")

# Manipulate file names to get various bits of info
file_paths_gtex %>%
  as_tibble %>%
  mutate(file = gsub(x=value, pattern="_all_susie.rds", replacement="")) %>%
  separate(col=file,
           into=c("pheno", "chr", "pos", "a1", "a2", "t1", "t2", "t3", "t4", "t5", "t6"),
           sep="_",
           remove=FALSE) -> file_split

# Clunky way to get out gene and tissue names from file names
tmp = strsplit(file_split$file, "_")
gene = sapply(tmp, tail, 1)
tissue_fct = function(df) {
  x = paste0(df[7:(length(df)-1)], sep="_", collapse="")
  x = tolower(substr(x, 1, nchar(x)-1))
}
tissue = sapply(tmp, tissue_fct)

# Add gene and tissue to above dataframe
file_split %>%
  mutate(gene   = gene,
         tissue = tissue) %>%
  unite("snp", chr:a2) %>%
  select(file=value, pheno, snp, gene, tissue) -> file_split

# Read all .rds files
data_list = lapply(file_paths_gtex, readRDS)

# Function to extract coloc.susie information from results
summary_fct = function(df) {
  return(df$summary)
}

# Collapse list of coloc results into single dataframe and add a few new columns
##Additional commands to work around summary with NULL data:
tmp_1 <- lapply(data_list, summary_fct)
nonnull <- sapply(tmp_1, typeof)!="NULL"  # find all NULLs to omit
ID <- file_split$snp
index_row <- 1:937 #the length of tmp_1 which is the total nuber of all_susie.rds files
tmp_2 <- map2(tmp_1, ID, ~cbind(.x, snp = .y))
tmp_3 <- map2(tmp_2, index_row, ~cbind(.x, n_index = .y))
tmp_4 <- tmp_3[nonnull]

file_split$n_index <- index_row

data_summary_colocsusie = ldply(tmp_4) %>%
  as_tibble %>%
  left_join(file_split,by=c("snp","n_index")) %>%
  select(-file) %>%
  mutate(coloc_susie = if_else(PP.H4.abf >= 0.9, TRUE, FALSE)) %>%
  arrange(desc(PP.H4.abf))

# Check if there are any colocalisations
data_summary_colocsusie %>% filter(coloc_susie) %>% select(snp, pheno, gene)


# Save all results to file
write_tsv(x=data_summary_colocsusie, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/colocsusie_asthma_GTEx.tsv")

#SAVE COLOC AND COLOC.SUSIE GENES INTO XLSX FILE:
gene_coloc <- as.data.frame(data_summary %>% filter(coloc) %>% select(gene))
gene_colocsusie <- as.data.frame(data_summary_colocsusie %>% filter(coloc_susie) %>% select(gene))
gene_gtex <- rbind(gene_coloc,gene_colocsusie) %>% unique()
write.xlsx(gene_gtex,"/alice-home/3/n/nnp5/PhD/PhD_project/Var_to_Gene/input/var2genes_raw.xlsx", sheetName = "GTExV8_eQTL_genes", row.names=FALSE, col.names=FALSE, append=TRUE)

###############
### eqtlGen ###
###############
print("eqtlGen eQTL coloc results")
# .rds files containing standard coloc results with eQTLs
setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/eqtlgen/")
file_paths_eqtlgen = list.files(pattern="_all_coloc.rds")

# Manipulate file names to get various bits of info
file_paths_eqtlgen %>%
  as_tibble %>%
  mutate(file = gsub(x=value, pattern="_all_coloc.rds", replacement="")) %>%
  separate(col=file,
           into=c("pheno", "chr", "pos", "a1", "a2", "t1"),
           sep="_",
           remove=FALSE) -> file_split

# Clunky way to get out gene and tissue names from file names
tmp = strsplit(file_split$file, "_")
gene = sapply(tmp, tail, 1)
tissue_fct = function(df) {
  x = paste0(df[6])
}
tissue = sapply(tmp, tissue_fct)

# Add gene and tissue to above dataframe
file_split %>%
  mutate(gene   = gene,
         tissue = tissue) %>%
  unite("snp", chr:a2) %>%
  select(file=value, pheno, snp, gene, tissue) -> file_split

# Read all .rds files
data_list = lapply(file_paths_eqtlgen, readRDS)

# Function to extract coloc information from results
summary_fct = function(df) {
  return(df$summary)
}

# Collapse list of coloc results into single dataframe and add a few new columns
data_summary = ldply(lapply(data_list, summary_fct)) %>%
  as_tibble %>%
  bind_cols(file_split) %>%
  select(-file) %>%
  mutate(coloc = if_else(PP.H4.abf >= 0.9, TRUE, FALSE)) %>%
  arrange(desc(PP.H4.abf))

# Check if there are any colocalisations
data_summary %>% filter(coloc) %>% select(snp, pheno, gene)

# Save all results to file
write_tsv(x=data_summary, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/coloc_asthma_eqtlgen.tsv")

# .rds files containing coloc.susie results with eQTLs
file_paths_eqtlgen= list.files(pattern="_all_susie.rds")

# Manipulate file names to get various bits of info
file_paths_eqtlgen %>%
  as_tibble %>%
  mutate(file = gsub(x=value, pattern="_all_susie.rds", replacement="")) %>%
  separate(col=file,
           into=c("pheno", "chr", "pos", "a1", "a2", "t1"),
           sep="_",
           remove=FALSE) -> file_split

# Clunky way to get out gene and tissue names from file names
tmp = strsplit(file_split$file, "_")
gene = sapply(tmp, tail, 1)
tissue_fct = function(df) {
  x = paste0(df[6])
}
tissue = sapply(tmp, tissue_fct)

# Add gene and tissue to above dataframe
file_split %>%
  mutate(gene   = gene,
         tissue = tissue) %>%
  unite("snp", chr:a2) %>%
  select(file=value, pheno, snp, gene, tissue) -> file_split

# Read all .rds files
data_list = lapply(file_paths_eqtlgen, readRDS)

# Function to extract coloc.susie information from results
summary_fct = function(df) {
  return(df$summary)
}

# Collapse list of coloc results into single dataframe and add a few new columns
##Additional commands to work around summary with NULL data:
tmp_1 <- lapply(data_list, summary_fct)
nonnull <- sapply(tmp_1, typeof)!="NULL"  # find all NULLs to omit
ID <- file_split$snp
index_row <- 1:169 #the length of tmp_1 which is the total nuber of all_susie.rds files
tmp_2 <- map2(tmp_1, ID, ~cbind(.x, snp = .y))
tmp_3 <- map2(tmp_2, index_row, ~cbind(.x, n_index = .y))
tmp_4 <- tmp_3[nonnull]

file_split$n_index <- index_row

data_summary_colocsusie = ldply(tmp_4) %>%
  as_tibble %>%
  left_join(file_split,by=c("snp","n_index")) %>%
  select(-file) %>%
  mutate(coloc_susie = if_else(PP.H4.abf >= 0.9, TRUE, FALSE)) %>%
  arrange(desc(PP.H4.abf))

# Check if there are any colocalisations
data_summary_colocsusie %>% filter(coloc_susie) %>% select(snp, pheno, gene)


# Save all results to file
write_tsv(x=data_summary_colocsusie, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/colocsusie_asthma_eqtlgen.tsv")

#SAVE COLOC AND COLOC.SUSIE GENES INTO XLSX FILE:
#coloc.susie does not have any significant colocalisation:
gene_coloc <- as.data.frame(data_summary %>% filter(coloc) %>% select(gene))
write.xlsx(gene_coloc,"/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/var2genes_raw.xlsx", sheetName = "eqtlGen_eQTL_genes", row.names=FALSE, col.names=FALSE, append=TRUE)

src/coloc_UBClung/000_submit_lookup_lung_eQTL.sh

#!/bin/bash

#SBATCH --job-name=lookup_lungeqtl
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=1:0:0
#SBATCH --mem=30gb
#SBATCH --account=gen1
#SBATCH --export=NONE

module load R

Rscript src/coloc_UBClung/000_run_lookup_lung_eQTL.r "UBCLung" $CREDSET

#To be submitted as:
#for c in ${!cs[*]}; do
#
#  sbatch --export=CREDSET="${cs[c]}" ./src/coloc_UBClung/000_submit_lookup_lung_eQTL.sh
#
#done

src/coloc_UBClung/000_run_lookup_lung_eQTL.r

#!/usr/bin/env Rscript

#Rationale: Lookup of overlapping region between ssevere asthma GWAS and UBC Lung eQTL significant genes in order to
#perform, colocalisation only in regions that have significant associations in both studies.

sink(stderr())

suppressMessages(library(data.table))
suppressMessages(library(tidyverse))
suppressMessages(library(dplyr))

args     = commandArgs(trailingOnly = TRUE)
tissue   = args[1] #"UBCLung"
cred_set = args[2]

tmp      = unlist(strsplit(cred_set, split="_"))
chr_sentinel      = as.numeric(tmp[2])
location_sentinel = as.numeric(tmp[3])

tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung/"

##allele are place in alphabetical order ! (no dependency on effect allele, reference/alternative)

eGene <- fread("/data/gen1/LF_HRC_transethnic/eQTL/lung_eQTL/eGene_list.txt")
eGene$chr <- gsub("_[0-9]*_[A-Z]*_[A-Z]*","",eGene$SNP)
eGene$bp <- gsub("^[0-9]*_","",eGene$SNP)
eGene$bp <- gsub("_[A-Z]*_[A-Z]*","",eGene$bp)

eGENE_lookup <- eGene %>% filter(chr == chr_sentinel,
                                  bp >= location_sentinel-1000000,
                                  bp <= location_sentinel+1000000)
eGENE_lookup <-  eGENE_lookup %>% mutate(sentinel=cred_set)
probeset <- eGENE_lookup %>% distinct(ProbeSet)

#colnames: SNP ProbeSet p_value BF TESTS chr bp sentinel
write_delim(x=eGENE_lookup, file=paste0(tmp_path, cred_set,"_", tissue, "_pairs.txt.gz"))
write_delim(x=probeset, file=paste0(tmp_path, cred_set,"_", tissue, "_probesets.txt"), col_names=FALSE)

src/coloc_UBClung/002_prepare_LDinput_ubclung.R

#!/usr/bin/env Rscript

#Rationale: combine names in a document to create LD .raw file for UBCLung

library(tidyverse)

args     = commandArgs(trailingOnly = TRUE)
tissue = args[1]

#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung/"

df <- data.frame(cred_set = c("SA_8_81292599_C_A","SA_6_90963614_AT_A","SA_5_110401872_C_T",
    "SA_2_242692858_T_C","SA_15_67442596_T_C","SA_12_56435504_C_G","SA_11_76296671_G_A",
    "SA_9_6209697_G_A","SA_5_131885240_C_G","SA_3_33042712_C_T",
    "SA_2_102913642_AAAAC_A","SA_17_38168828_G_A","SA_16_27359021_C_A","SA_15_61068954_C_T",
    "SA_12_48202941_T_C","SA_10_9064716_C_T")) %>%
  as_tibble %>%
  separate(cred_set, c("pheno", "chr", "pos", "a1", "a2"), sep="_")

df$ancestry <- "EUR"

df <- df %>%
  unite("snp", chr:a2, remove=FALSE) %>%
  mutate(eQTL  = tissue,
         start = as.numeric(pos) - 5e5,
         end   = as.numeric(pos) + 5e5) %>%
  select(pheno, ancestry, snp, chr, pos, eQTL, start, end)

write_tsv(x=df, file=paste0(tmp_path,tissue,"_Pairs_lookup.txt"),col_names=F, append=T)

src/coloc_UBClung/003_submit_coloc_susie_lung_eQTL.sh

#!/bin/bash

#SBATCH --job-name=coloc_susie_UBCLung
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=0:30:0
#SBATCH --mem=32gb
#SBATCH --account=gen1
#SBATCH --export=NONE

tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung/"

i=$SLURM_ARRAY_TASK_ID
read PROBESET < <(awk -F'\t' 'NR == '$((i))' { print $1 }' ${tmp_path}${CREDSET}_UBCLung_probesets.txt)

module load R

Rscript src/coloc_UBClung/003_run_coloc_susie_lung_eQTL.r "UBCLung" $CREDSET $PROBESET

src/coloc_UBClung/003_run_ coloc_susie_lung_eQTL.r

#!/usr/bin/env Rscript

#Rationale: Run coloc and coloc.susie in UBCLung eQTL

suppressMessages(library(tidyverse))
library(coloc)
suppressMessages(library(Rfast))
library(data.table)

args     = commandArgs(trailingOnly = TRUE)
tissue   = args[1]
cred_set = args[2]
probe    = args[3]

tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/ubclung/"

cred_set %>%
  as_tibble %>%
  separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
  select(pheno) %>%
  pull -> pheno

cred_set %>%
  as_tibble %>%
  separate(value, c("pheno", "chr", "pos", "a1", "a2"), sep="_") %>%
  unite("signal", chr:a2) %>%
  pull -> signal

##allele are place in alphabetical order ! (no dependency on effect allele, reference/alternative)

############################
## Read asthma GWAS sumstat, eQTL GWAS, and LD matrix
############################
GWAS = read_delim(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/", cred_set, "_GWASpairs.txt.gz")) %>%
  rename(chr = b37chr, pos = bp, beta = LOG_ODDS, SE = se) %>%
  select(-snpid) %>%
  arrange(chr, pos) %>%
  drop_na(beta) %>%
  mutate(varbeta = SE^2) %>%
  rename(position=pos)
GWAS$allele1.gwas <- GWAS$a1
GWAS <- GWAS %>% mutate(snp = paste0(chr, "_", position, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
  select(c(snp,beta,SE,eaf,pval,MAF,varbeta,allele1.gwas)) %>%
  distinct(snp, .keep_all=TRUE)

GWAS$N <- as.numeric(38405)

eQTL <- fread(paste0(tmp_path,"eQTL_region_stat/",cred_set,"_eQTL_region.txt",sep=""),header=T)
setnames(eQTL,"#Probe","ProbeSet")
eQTL <- eQTL %>% filter(ProbeSet==probe)
eQTL <- mutate(eQTL,snp=paste(CHR,BP,pmin(toupper(Allele1),toupper(Allele2)),pmax(toupper(Allele1),toupper(Allele2)),sep="_"))
eqtlGWAS <- eQTL %>% filter(snp %in% GWAS$snp)

setnames(eqtlGWAS,"Freq1","freq")
setnames(eqtlGWAS,"Effect","b")
setnames(eqtlGWAS,"StdErr","se")
setnames(eqtlGWAS,"P","pval")

eqtlGWAS$freq <- as.numeric(eqtlGWAS$freq)
eqtlGWAS$b <- as.numeric(eqtlGWAS$b)
eqtlGWAS$se <- as.numeric(eqtlGWAS$se)
eqtlGWAS$z <- eqtlGWAS$b/eqtlGWAS$se
eqtlGWAS$N <- 1/(2*eqtlGWAS$freq*(1-eqtlGWAS$freq)*eqtlGWAS$se*eqtlGWAS$se)-eqtlGWAS$z*eqtlGWAS$z
N = as.integer(mean(eqtlGWAS$N))
eqtlGWAS$N = N
eqtlGWAS <- mutate(eqtlGWAS,MAF=ifelse(freq>0.5,1-freq,freq))
eqtlGWAS <- mutate(eqtlGWAS,beta=ifelse(freq>0.5,-1*b,b))
eqtlGWAS <- eqtlGWAS %>% mutate(varbeta = se^2)
eqtlGWAS$allele1.eqtl <- toupper(eqtlGWAS$Allele1)
eqtlGWAS <- select(eqtlGWAS,"snp","MAF","beta","se","varbeta","pval","allele1.eqtl","N")

GWAS <- GWAS %>% filter(snp %in% eqtlGWAS$snp)

############################
# * if also want coloc results
############################
coloc_all = coloc.abf(dataset1=list(beta=GWAS$beta, varbeta=GWAS$varbeta,
                                    N=GWAS$N, type="cc", MAF=GWAS$MAF, snp=GWAS$snp),
                      dataset2=list(beta=eqtlGWAS$beta, varbeta=eqtlGWAS$varbeta,
                                    N=eqtlGWAS$N, type="quant", MAF=eqtlGWAS$MAF, snp=eqtlGWAS$snp))

saveRDS(coloc_all, paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/ubclung/", cred_set, "_ubclung_", probe, "_all_coloc.rds"))


########################
############################
# 1) Format LD matrix
############################
ld = data.table::fread(
  paste0(tmp_path, signal, "_", pheno, "_", tissue, ".raw")
) %>%
  select(!c(FID:PHENOTYPE))

ld %>%
  names %>%
  as_tibble %>%
  separate(value,c("c", "p", "a1", "a2")) %>%
  mutate(snp = paste0(c, "_", p, "_", pmin(a1, a2), "_", pmax(a1, a2))) %>%
  select(snp) %>%
  pull -> snps

ld = t(ld) %>%
  as_tibble %>%
  mutate(var=apply(., 1, var, na.rm=TRUE),
         snp=snps) %>%
  filter(var!=0) %>%
  filter(snp %in% GWAS$snp) %>%
  filter(snp %in% eqtlGWAS$snp)

N = ncol(ld)-2
X = t(as.matrix(ld[,1:N]))
colnames(X) = ld %>%
  select(snp) %>%
  pull # Sdd names to facilitate filtering Nas (still some NAs even with pairwise.complete.obs)
LDmatrix = cor(X, use="pairwise.complete.obs") # Pearson's correlation, takes some time...
LDmatrix[is.na(LDmatrix)] <- 0

############################
# 2) Format asthma GWAS
############################
bim = read_tsv(
    paste0(tmp_path, signal, "_", pheno, "_", tissue, ".bim"),
    col_names=c("chr", "snp", "morgans", "position", "allele1", "allele2")
  ) %>%
  select(!morgans) # Read bim file to check same set of SNPs and allele alignment

GWAS_df <- GWAS %>%
  filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs
  inner_join(bim) %>%
  mutate(beta=ifelse(allele1!=allele1.gwas, -(beta), beta)) # Check allele alignment

#Check GWAS has at least one variant with pval <= 0.000005:
GWAS_sign <- GWAS_df %>% filter(pval <= 0.000005)
if (dim(GWAS_sign)[1] < 1){
    stop(paste0(signal," ",tissue, " ", probe, " UBClung: No coloc.susie is possible because GWAS data has pvalue >= 5x10-6."))
}

GWAS_df %>%
  as.list -> GWAS

GWAS$type = "cc"
N = as.integer(mean(GWAS$N))
GWAS$N = N
GWAS$LD = LDmatrix

############################
# 3) Format eQTL GWAS
############################
eqtlGWAS_df <- eqtlGWAS  %>%
  filter(snp %in% colnames(LDmatrix)) %>% # Must be the same set of SNPs
  inner_join(bim) %>%
  mutate(beta=ifelse(allele1!=allele1.eqtl, -(beta), beta)) # Check allele alignment

#Check eqtlGWAS has at least one variant with pval <= 0.000005:
eqtlGWAS_sign <- eqtlGWAS_df %>% filter(pval <= 0.000005)
if (dim(eqtlGWAS_sign)[1] < 1){
    stop(paste0(signal," ",tissue, " ", probe, " UBClung: No coloc.susie is possible because eQTL-GWAS data has pvalue >= 5x10-6."))
} else {
    paste0(signal," ",tissue, " ", probe, " UBClung: Starting coloc.susie because both GWAS and eQTL data has pvalue <= 5x10-6.")
}

eqtlGWAS <- eqtlGWAS_df %>% as.list

eqtlGWAS$type = "quant"
eqtlGWAS$LD = LDmatrix
N = as.integer(mean(eqtlGWAS$N))
eqtlGWAS$N = N

############################
# Check datasets are ok
############################
check_dataset(eqtlGWAS, req="LD") -> eqtlGWAS_check
check_dataset(GWAS, req="LD") -> GWAS_check

if (is.null(GWAS_check) & is.null(eqtlGWAS_check)){
  cat(paste0(cred_set, "_", tissue, "_", probe, ": ok", "\n"),
      file=paste0(tmp_path,"GWAS_coloc_susie_check.txt"), append=TRUE)
} else {
  cat(paste0(cred_set, "_", tissue, "_", probe, ": warning", "\n"),
      file=paste0(tmp_path,"GWAS_coloc_susie_check.txt"), append=TRUE)
}

############################
## Run SuSie
############################
susie_GWAS = runsusie(GWAS, r2.prune=0.2, check_R=FALSE)
susie_eQTLGWAS = runsusie(eqtlGWAS, r2.prune=0.2, check_R=FALSE, verbose=TRUE)
susie_all = coloc.susie(susie_GWAS, susie_eQTLGWAS)

saveRDS(susie_all, paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/ubclung/", cred_set, "_", tissue, "_", probe, "_all_susie.rds"))
write_tsv(susie_all$summary %>% as_tibble,
          paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/ubclung/", cred_set, "_", tissue, "_", probe, "_susie_summary.tsv"))

src/coloc_UBClung/004_concat_coloc_susie_ubclung.R

#!/usr/bin/env Rscript

#Rationale: Collate results together and extract significant Variant/Locus-Tissue-Gene colocalisation, from UBCLung

library(plyr)
library(tidyverse)
library(purrr)
library(data.table)


#find probset that colocalised and map to gene using /data/gen1/reference/lung_eQTL/tabMerged_anno.txt

###############
### UBCLung ###
###############
print("UBCLung eQTL coloc results")
# .rds files containing standard coloc results with eQTLs
setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/results/ubclung/")
file_paths_ubclung = list.files(pattern="_all_coloc.rds")

# Manipulate file names to get various bits of info
file_paths_ubclung %>%
  as_tibble %>%
  mutate(file = gsub(x=value, pattern="_all_coloc.rds", replacement="")) %>%
  separate(col=file,
           into=c("pheno", "chr", "pos", "a1", "a2", "t1"),
           sep="_",
           remove=FALSE) -> file_split

# Clunky way to get out gene and tissue names from file names
tmp = strsplit(file_split$file, "_")
probe_1 = sapply(tmp, function(innerList) innerList[[length(innerList) - 1]])
probe_2 = sapply(tmp, tail, 1)
tissue_fct = function(df) {
  x = paste0(df[6])
}
tissue = sapply(tmp, tissue_fct)

# Add gene and tissue to above dataframe
file_split %>%
  mutate(probe_1   = probe_1,
         probe_2   = probe_2,
         tissue = tissue) %>%
  unite("snp", chr:a2) %>%
  select(file=value, pheno, snp, probe_1, probe_2, tissue) -> file_split

file_split$ProbeSet <- paste0(file_split$probe_1,"_",file_split$probe_2)

# Read all .rds files
data_list = lapply(file_paths_ubclung, readRDS)

# Function to extract coloc information from results
summary_fct = function(df) {
  return(df$summary)
}

# Collapse list of coloc results into single dataframe and add a few new columns
data_summary = ldply(lapply(data_list, summary_fct)) %>%
  as_tibble %>%
  bind_cols(file_split) %>%
  select(-file) %>%
  mutate(coloc = if_else(PP.H4.abf >= 0.9, TRUE, FALSE)) %>%
  arrange(desc(PP.H4.abf))

#map probset to gene:
gene_probe <- fread("/data/gen1/reference/lung_eQTL/tabMerged_anno.txt",header=T)
gene_probe$ProbeSet <- gsub("_at","",gene_probe$ProbeSet)
setnames(gene_probe,"gene","gene_id")
data_summary <- left_join(data_summary, gene_probe,by="ProbeSet")

# Check if there are any colocalisations
data_summary %>% filter(coloc) %>% select(snp, pheno, gene_id)

# Save all results to file
write_tsv(x=data_summary, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/coloc_asthma_ubclung.tsv")

# .rds files containing coloc.susie results with eQTLs
file_paths_ubclung = list.files(pattern="_all_susie.rds")

# Manipulate file names to get various bits of info
file_paths_ubclung %>%
  as_tibble %>%
  mutate(file = gsub(x=value, pattern="_all_susie.rds", replacement="")) %>%
  separate(col=file,
           into=c("pheno", "chr", "pos", "a1", "a2", "t1"),
           sep="_",
           remove=FALSE) -> file_split

# Clunky way to get out gene and tissue names from file names
tmp = strsplit(file_split$file, "_")
probe_1 = sapply(tmp, function(innerList) innerList[[length(innerList) - 1]])
probe_2 = sapply(tmp, tail, 1)
tissue_fct = function(df) {
  x = paste0(df[6])
}
tissue = sapply(tmp, tissue_fct)

# Add gene and tissue to above dataframe
file_split %>%
  mutate(probe_1   = probe_1,
         probe_2   = probe_2,
         tissue = tissue) %>%
  unite("snp", chr:a2) %>%
  select(file=value, pheno, snp, probe_1, probe_2, tissue) -> file_split
file_split$ProbeSet <- paste0(file_split$probe_1,"_",file_split$probe_2)

# Read all .rds files
data_list = lapply(file_paths_ubclung, readRDS)

# Function to extract coloc.susie information from results
summary_fct = function(df) {
  return(df$summary)
}

# Collapse list of coloc results into single dataframe and add a few new columns
##Additional commands to work around summary with NULL data:
tmp_1 <- lapply(data_list, summary_fct)
nonnull <- sapply(tmp_1, typeof)!="NULL"  # find all NULLs to omit
ID <- file_split$snp
index_row <- 1:length(tmp_1) #the length of tmp_1 which is the total nuber of all_susie.rds files
tmp_2 <- map2(tmp_1, ID, ~cbind(.x, snp = .y))
tmp_3 <- map2(tmp_2, index_row, ~cbind(.x, n_index = .y))
tmp_4 <- tmp_3[nonnull]

file_split$n_index <- index_row

data_summary_colocsusie = ldply(tmp_4) %>%
  as_tibble %>%
  left_join(file_split,by=c("snp","n_index")) %>%
  select(-file) %>%
  mutate(coloc_susie = if_else(PP.H4.abf >= 0.9, TRUE, FALSE)) %>%
  arrange(desc(PP.H4.abf))

#map probset to gene:
data_summary_colocsusie <- left_join(data_summary_colocsusie, gene_probe,by="ProbeSet")

# Check if there are any colocalisations
data_summary_colocsusie %>% filter(coloc_susie) %>% select(snp, pheno, gene_id)

# Save all results to file
write_tsv(x=data_summary_colocsusie, file="/scratch/gen1/nnp5/Var_to_Gen_tmp/results/colocsusie_asthma_ubclung.tsv")

#SAVE COLOC AND COLOC.SUSIE GENES INTO XLSX FILE:
#coloc.susie does not have any significant colocalisation:
gene_coloc <- as.data.frame(data_summary %>% filter(coloc) %>% select(gene_id))
gene_coloc_susie <- as.data.frame(data_summary_colocsusie %>% filter(coloc_susie) %>% select(gene_id))
print(gene_coloc)
#NO gene coloc_susie
write.table(gene_coloc,"/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/UBCLungeqtl_var2genes_raw.txt", row.names=FALSE, col.names=FALSE, quote=F)

#then add the genes in the /Var_to_Gene/input/var2genes_raw.xlsx file

src/merge_genes_eqtl_coloc.R

#!/usr/bin/env Rscript

#Rationale: merge genes from eQTL colocalisation analysis

library(tidyverse)
library(data.table)
library(readxl)

genes_raw <- "input/var2genes_raw.xlsx"
gtex <- read_excel(genes_raw, sheet = "GTExV8_eQTL_genes_symbol",col_names = c("ensembl","gene"))
eqtlgen <- read_excel(genes_raw, sheet = "eqtlGen_eQTL_genes",col_names = "gene")
ubclung <- read_excel(genes_raw, sheet = "UBClung_eQTL_genes",col_names = "gene")

gtex$ensembl <- NULL
eqtl_gene <- rbind(gtex,eqtlgen,ubclung) %>% unique()
eqtl_gene$evidence <- as.factor("eQTL")

write_tsv(x=eqtl_gene, file="input/eqtl_gene_merge")

Quantitative trait loci (QTL) allows us to measure a trait, such as RNA expression of genes or protein levels, in a specific tissue and test for association of genes within the interested tissue with genetic variant. In other words, the expression trait is used as a quantitative trait for a genome-wide analysis.
Looking at QTL and GWAS summary statistics, we can use statistical methods -colocalisation- to assess if both study reveal statistically significant association in the same loci for the same variant. If so, we can then declare that the variant we found from the GWAS show evidence of association with a gene in a specific tissue.
Different types of QTL data are available, based on the quantitative trait we study as well as at the type of variant-gene relationship we want to look at. As an example, expression QTL looks at mRNA expression; in addition, if the analysis is only for variants near the gene’s transcription start site (usually +/-1Mb), this QTL is called ‘cis’-eQTL; if the analysis is for variants further away that can have an effect on gene expression via the 3D conformation on DNA, they are called ‘trans’-eQTL. If, instead, we want to investigate the role of the variant on the splicing of the mRNA, we call it ‘s’QTL (cis or trans-sQTL). When analysing protein level, we identify the measure as it ’p’QTL (methylation quantitative measure, ’m’QTL).
## eQTL colocalisation I used cis-QTL data from three datasets: GTExV8, eQTLGen blood, and UBC lung eQTL. I run the analysis if the cis-eQTL showed significant association within the credible set regions, and I included shared variants present in both eQTL and the GWAS summary statistic within +/-500Kb from the associated variant. The colocalisation was performed using coloc (https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1004383) or coloc.susie (http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Abstract&list_uids=34587156) methods.
Here, a little description of each cis-eQTL data and the used parameters.
GTExV8 (https://www.gtexportal.org/)
GTExV8 is a collection of eQTL data for significant variant-gene association in 49 tissues with at least 70 samples (https://www.gtexportal.org/home/methods). I run co-localisation for the following tissues: ’Artery_Aorta’, ‘Artery_Coronary’, ‘Artery_Tibial’, ‘Colon_Sigmoid’, ‘Colon_Transverse’, ‘Esophagus_Gastroesophageal_Junction’, ‘Esophagus_Muscularis’, ‘Lung’, ‘Skin_Not_Sun_Exposed_Suprapubic’, ‘Skin_Sun_Exposed_Lower_leg’,‘Small_Intestine_Terminal_Ileum’, ‘Stomach’.
I used filtered GTExV8 data including variant-gene pairs for each tissue with both GWAS and eQTL pvalues <= 5x10-6.
eQTLgen blood eQTLs (http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Abstract&list_uids=34475573)
eQTLgen blood eQTLs is a source for 16,989 eQTL genes from 37 dataset and a total of 31,684 individuals, counting 11M SNPs with MAF >=1% and within 1Mb from the analysed genes (https://www.eqtlgen.org/phase1.html). I used filtered eQTLGen blood variant-gene pairs with both GWAS or eQTL pvalues <= 5x10-6.
UBC lung eQTL (http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=PubMed&dopt=Abstract&list_uids=23209423)
UBC Lung eQTL is a collection of lung specimens from 1,111 individuals of European ancestry including 17,178 cis- and 593 trans- lung eQTLs.
For UBC Lung, raw data were adjusted to find significant p-value: eigenMT to correct for LD structure p-value, and then benjamini hochberg correction (FDR) to take into account of multiple testing.
EigenMT and FDR p-value correction were performed by Jing Chen as for the multi-ancestry lung function paper; detailed information on the analysis can be found in the main manuscript and related supplementary information (https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-023-01314-0/MediaObjects/41588_2023_1314_MOESM1_ESM.pdf).
Jing provided me with the list of significant gene-variant associations for UBC Lung eQTL after p-value correction. She also provided me backbone scripts to run colocalisation with this eQTL.

pQTL Look-up

Code
src/pQTL_coloc/000_preprocess_cs_b38.R

#!/bin/bash

#SBATCH --job-name=ukbpqtol_lookup
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=1:0:0
#SBATCH --mem=24gb
#SBATCH --account=gen1
#SBATCH --export=NONE


i=$((SLURM_ARRAY_TASK_ID))

#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"
THRESH=`bc <<< "scale=10; 0.05/1463"`
locus=('9_rs992969_5709697_6709697' '8_rs7824394_80792599_81792599' '6_rs148639908_90463614_91463614' '5_rs2188962_rs848_131270805_132496500' '5_rs1837253_rs3806932_109901872_110905675' '3_rs35570272_32547662_33547662' '2_rs6761047_242192858_243192858' '2_rs12470864_102426362_103426362' '17_17:38073838_CCG_C_37573838_38573838' '16_rs3024619_26864806_27864806' '15_rs17293632_66942596_67942596' '15_rs11071559_60569988_61569988' '12_rs73107993_47695873_48695873' '12_rs705705_55935504_56935504' '11_rs10160518_75796671_76796671' '10_rs201499805_rs1444789_8542744_9564361' '6_rs9271365_32086794_33086794')
CREDSET="${locus[i]}"
mkdir ${tmp_path}/ukb_pqtl/${CREDSET}

awk 'NR > 1 {print$0}'  ${tmp_path}/ukb_pqtl/SA_${CREDSET}_b38 | while IFS='' read -r LINE || [ -n "${LINE}" ]
do

chr=`echo ${LINE} | awk '{print $3}'`
pos=`echo ${LINE} | awk '{print $11}'`

/data/gen1/UKBiobank/olink/pQTL/pqtl_lookup.sh -r ${chr}:${pos}-${pos} -p ${THRESH} \
    >  ${tmp_path}/ukb_pqtl/${CREDSET}/SA_${chr}_${pos}_ukb_protein_lookup.txt
done

##run job as:
#sbatch --array 0-16 src/pQTL_coloc/000_submit_lookup_ukbpqtl.sh

src/pQTL_coloc/000_submit_lookup_ukbpqtl.sh

#!/usr/bin/env Rscript

#Rationale: Create separate credible sets file for each genomic locus.

args     = commandArgs(trailingOnly = TRUE)
cred_set = args[1]
tmp_path = args[2]
cred_set_b38 = args[3]

library(tidyverse)
library(data.table)

cs <- fread(cred_set)
b38 <- fread(cred_set_b38)
colnames(b38) <- c("chr_b38","pos_b38","pos1_b38")
cs_b38 <- cbind(cs,b38)

pheno <- unique(cs_b38$locus)

#split credset df into separate file for each locus:
##NB: allele2 is the effect allele, I change them because in /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat a1 is the effect allele, a2 is the non effect allele

for (i in pheno){
cs_b38_tmp <- cs_b38 %>% filter(locus == as.character(i))
locus <- as.character(unique(cs_b38_tmp$locus))
fwrite(cs_b38_tmp, paste0(tmp_path,"SA_",locus,"_b38"), quote=F, sep="\t")}

src/pQTL_coloc/001_combine_pqtl.awk

#!/usr/bin/awk -f

#Rationale: combine all the UKB-pQTL with significant variant-pQTL association after look-up analysis in severe asthma GWAS
#credible set variants.

BEGIN {
    OFS = "\t"
}

NR == 1 {
    print "LOCUS", $0
}

FNR > 1 {
    sub("_ukb_protein_lookup.txt", "", FILENAME)
    print FILENAME, $0
}

src/pQTL_coloc/000_submit_lookup_scallop.sh

#!/bin/bash

#SBATCH --job-name=scallop_pqtl_lookup
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=01:00:00
#SBATCH --mem=24gb
#SBATCH --account=gen1
#SBATCH --export=NONE


#Rationale: look-up in pQTL in SCALLOP pQTL - script from Chiara

PQTL_DATA="/data/gen1/reference/SCALLOP"
SNP_LIST="/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt"
PQTL_PATH="/scratch/gen1/nnp5/Var_to_Gen_tmp/scallop_pqtl"

## List of variants with columns: ID, CHROM, POS
cat ${SNP_LIST} | awk '{print $3"_"$4"_"$5"_"$6,$3,$4}' > ${PQTL_PATH}/snps_list.txt

# Set up log file
touch ${PQTL_PATH}/log_pQTL_SCALLOP_analysis
echo -ne "FILENAME\tPROTEIN\tN_LOOKUP\tN_NOMINAL\tN_SIG" > ${PQTL_PATH}/log_pQTL_SCALLOP_analysis
echo >> ${PQTL_PATH}/log_pQTL_SCALLOP_analysis


# Perform look-up using Nick's awk script
for filename in ${PQTL_DATA}/*.txt.gz
do
  ## Set file name
  name=$(basename ${filename} .txt.gz)
  ## Header for output
  zcat ${filename} | head -1 > ${PQTL_PATH}/${name}.txt
  ## Perform look-up
  /home/n/nnp5/PhD/PhD_project/Var_to_Gene/src/pQTL_coloc/000_scallop_lookup.awk \
      ${PQTL_PATH}/snps_list.txt <(zcat $filename) > ${PQTL_PATH}/${name}.txt
  ## Report look-up statistics in log file
  N_total=`cat ${PQTL_PATH}/${name}.txt | tail -n +2 -q | wc -l`
  N_nominal=`cat ${PQTL_PATH}/${name}.txt | awk '$8 < 0.05 {print $0}' | wc -l`
  N_sig=`cat ${PQTL_PATH}/${name}.txt | awk '$8 < 5E-8 {print $0}' | wc -l`
  echo -ne "${name}.txt\t${name}\t${N_total}\t${N_nominal}\t${N_sig}" >> ${PQTL_PATH}/log_pQTL_SCALLOP_analysis
  echo >> ${PQTL_PATH}/log_pQTL_SCALLOP_analysis
done

src/pQTL_coloc/000_scallop_lookup.awk

#!/usr/bin/awk -f

# For each line of the first input file (the proxies) add a chr:pos key to the proxies array
# The array keys are unique so there won't be any duplicates even if you 
# add the same chr, pos twice.
#
# NR = line number over all input files
# FNR = line number in current file
# Hence when NR == FNR you must be in the first input file
# "next" means go to the next line in the file without processing any subsequent rules
NR == FNR {
    proxies[$2][$3] = 1
    next
}

# For the second input file (the pQTL results) output the header
# (processing will only get this far if the rule above didn't match)
FNR == 1 {
    print
}

# For subsequent lines in the the second file 
FNR > 1 {

    # Store a copy of the original line to output
    line = $0

    # Substitute ":" to split chr, pos into fields for matching
    gsub(":", "\t", $0)
}

# Print the original line if chr, pos is in our array of proxies
proxies[$1][$2] {
    print line
}

src/pQTL_coloc/000_submit_lookup_decode.sh

#!/bin/bash

#SBATCH --job-name=decode_pqtl_lookup
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=72:00:00
#SBATCH --mem=70gb
#SBATCH --account=gen1
#SBATCH --export=NONE


#Rationale: look-up in pQTL in deCODE pQTL - script from Chiara
ukbb38_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/ukb_pqtl"
PQTL_DATA="/data/gen1/pQTL/Ferkingstad_2021"
PQTL_PATH="/scratch/gen1/nnp5/Var_to_Gen_tmp/decode_pqtl"

#USE b38 POSITION:
##NOTE: DEPENDENCIES ON SCRIPT src/pQTL_coloc/000_preprocess_cs_b38.R TO CREATE B38 FILES:
## List of variants with columns: ID (chr_posb37_a1_a2), CHROM, POS (pos b38):
awk 'NR >1 {print $3"_"$4"_"$5"_"$6, "chr"$3, $11}' \
    ${ukbb38_path}/SA_*_b38 | grep -v "^chromosome" > ${PQTL_PATH}/snps_list.txt

# Set up log file
touch ${PQTL_PATH}/log_pQTL_decode_analysis
echo -ne "FILENAME\tPROTEIN\tN_LOOKUP\tN_NOMINAL\tN_SIG" > ${PQTL_PATH}/log_pQTL_decode_analysis
echo >> ${PQTL_PATH}/log_pQTL_decode_analysis


# Perform look-up using Nick's awk script
for filename in ${PQTL_DATA}/*.txt.gz
do
  ## Set file name
  name=$(basename ${filename} .txt.gz)
  ## Header for output
  zcat ${filename} | head -1 > ${PQTL_PATH}/${name}.txt
  ## Perform look-up
  ## Perform look-up
  /home/n/nnp5/PhD/PhD_project/Var_to_Gene/src/pQTL_coloc/000_decode_lookup.awk \
      ${PQTL_PATH}/snps_list.txt <(zcat $filename) > ${PQTL_PATH}/${name}.txt
  ## Report look-up statistics in log file
  N_total=`cat ${PQTL_PATH}/${name}.txt | tail -n +2 -q | wc -l`
  N_nominal=`cat ${PQTL_PATH}/${name}.txt | awk '$10 < 0.05 {print $0}' | wc -l`
  N_sig=`cat ${PQTL_PATH}/${name}.txt | awk '$10 < 1.8E-9 {print $0}' | wc -l`
  echo -ne "${name}.txt\t${name}\t${N_total}\t${N_nominal}\t${N_sig}" >> ${PQTL_PATH}/log_pQTL_decode_analysis
  echo >> ${PQTL_PATH}/log_pQTL_decode_analysis
done

src/pQTL_coloc/000_decode_lookup.awk

#!/usr/bin/awk -f

# For each line of the first input file (the proxies) add a chr:pos key to the proxies array
# The array keys are unique so there won't be any duplicates even if you 
# add the same chr, pos twice.
#
# NR = line number over all input files
# FNR = line number in current file
# Hence when NR == FNR you must be in the first input file
# "next" means go to the next line in the file without processing any subsequent rules
NR == FNR {
    proxies[$2][$3] = 1
    next
}

# For the second input file (the pQTL results) output the header
# (processing will only get this far if the rule above didn't match)
FNR == 1 {
    print
}

# For subsequent lines in the the second file 
FNR > 1 {

    # Store a copy of the original line to output
    line = $0

    # Substitute ":" to split chr, pos into fields for matching
    gsub(":", "\t", $0)
}

# Print the original line if chr, pos is in our array of proxies
proxies[$1][$2] {
    print line
}

I decided to perform a look-up of the 615 credible set variants in three pQTL sources: deCODE plasma, SCALLOP, and UKBiobank pQTL data. Significant
deCODE plasma pQTL (https://pubmed.ncbi.nlm.nih.gov/34857953/) deCODE is a free collection of protein plasma pQTL data for 4,907 proteins and 27.2 million tested variants. They used Bonferroni corrected p-value for multiple testing (0.05/27,200,000=1.8x10−9). They included variants with a MAF > 0.01% and imputation score > 0.9. If the associated variant was within 1Mb from the transcription start site of the gene, the pQTL was defined as cis, if outside this threshold as trans.
SCALLOP pQTL (https://pubmed.ncbi.nlm.nih.gov/33067605/)
SCALLOP (Systematic and Combined Analysis of Olink Proteins) pQTL combines measures from 90 circulating cardiovascular proteins in about 30,000 individuals from 13 cohorts.
UK Biobank pQTL (https://www.nature.com/articles/s41586-023-06592-6) The Pharma Proteomics Project led to the analysis of 2,923 plasma proteins in 54,219 UK Biobank participants overall. Within our group, the pQTL association was performed again in 48,195 European samples (as defined in Shrine et al. 2023) and for 1,463 proteins, including 44.8 million MAC >= 5 variants.
Variant-protein associations were defined as significant if pQTL p-value was less or equal to 1.8E-9 for deCODE, 5E-8 for SCALLOP, and 3.41E-5 (0.05/1,463) for UK Biobank pQTL.

Polygenic Priority Score (PoPS)

Code src/PoPS/PoPS.sh

#!/usr/bin/env bash

#Rationale: run PoPS with severe asthma summary statistics
#Based on scripts and log files found in: /data/gen1/TSH/PoPS
#PoPS v0.1: https://github.com/FinucaneLab/pops/tree/add-license-1
#software stored in /data/gen1/LF_HRC_transethnic/PoPS/pops

module load R
#intermediate files in:
tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp/"

#Download MAGMA and PoPS:
cd /home/n/nnp5/software
wget https://ctg.cncr.nl/software/MAGMA/prog/magma_v1.10.zip
unzip https://ctg.cncr.nl/software/MAGMA/prog/magma_v1.10.zip
cd /home/n/nnp5/PhD/PhD_project/Var_to_Gene

#Step 0: Generate gene-level association statistics as by MAGMA to have genes.out and genes.raw files:
##bfile and gene-annot: already downloaded with respect to TSH/lung function analysis - use the same.
## pval: sumstats file from severe asthma GWAS as this:
#SNP    p   N
#rs147324274    0.9139  124358
#rs369318156    0.4323  124358

#Create the .sumstats file:
awk '{print $1, $9, $3=46086}' \
    /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat | \
    sed 's/snpid pval 46086/SNP p N/g' > ${tmp_path}/pops/SA.sumstat

/home/n/nnp5/software/magma \
    --bfile /data/gen1/LF_HRC_transethnic/PoPS/data/1000G.EUR \
      --gene-annot /data/gen1/LF_HRC_transethnic/PoPS/data/magma_0kb.genes.annot \
      --pval ${tmp_path}/pops/SA.sumstat ncol=N \
      --gene-model snp-wise=mean \
      --out ${tmp_path}/pops/SA


#Step 1: Select features
#the features are in: /data/gen1/LF_HRC_transethnic/PoPS/data/PoPS.features.txt.gz
sbatch src/PoPS/submit_pops.sh

src/PoPS/submit_pops.sh

#!/bin/bash

#SBATCH --job-name=SA_POPS
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=60:00:00
#SBATCH --mem=50gb
#SBATCH --account=gen1
#SBATCH --export=NONE

#Rationale: PoPS feature selection and score calculation

tmp_path="/scratch/gen1/nnp5/Var_to_Gen_tmp"
popsdir="/data/gen1/LF_HRC_transethnic/PoPS"

module load python3

python ${popsdir}/pops/pops.feature_selection.py \
    --features ${popsdir}/data/PoPS.features.txt.gz \
    --gene_results ${tmp_path}/pops/SA \
      --out ${tmp_path}/pops/SA \

for i in {1..22}
do
    python ${popsdir}/pops/pops.predict_scores.py \
        --gene_loc ${popsdir}/data/gene_loc.txt \
        --gene_results ${tmp_path}/pops/SA \
        --features ${popsdir}/data/PoPS.features.txt.gz \
        --selected_features  ${tmp_path}/pops/SA.features \
        --control_features ${popsdir}/data/control.features \
        --chromosome ${i} \
        --out ${tmp_path}/pops/SA
done

src/PoPS/PoPS_summary.R

#!/usr/bin/env Rscript

#Rationale: Look at PoPS results and create a summary of them
##We currently suggest taking the highest scoring gene in each GWAS locus.
##You could further filter this set to only include genes in top 10% of PoP scores across all genes.
##Negative scores generally mean low evidence -- this is the predicted MAGMA z-score!!’ (https://github.com/FinucaneLab/pops/issues/4)
##From TSH paper: ‘prioritized genes for all autosomal TSH sentinel variants within a 500kb (±250kb) window of
#the sentinel variant and reported the top prioritised genes in the region.
#If there was no gene prioritized within a 500kb window of the sentinel,
#we reported any top prioritized genes within a 1Mb window (Supplementary Data 19).’.


library(data.table)
library(tidyverse)

sig_list <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt")
setnames(sig_list,"chromosome","chr")
setnames(sig_list,"position","pos")
sig_list$sentinel <- paste0(sig_list$chr,"_",sig_list$pos,"_",sig_list$allele1,"_",sig_list$allele2)
locus <- unique(sig_list$locus)
gene_loc <- fread("/data/gen1/LF_HRC_transethnic/PoPS/data/gene_loc.txt")
gene_features <- fread("/data/gen1/LF_HRC_transethnic/PoPS/data/PoPS.features.txt.gz")

##### sentinel 500 kb total window #####
result <- data.frame(matrix(ncol = 16,nrow = 0))
for(i in locus){
    locus_sig_list <- sig_list %>% filter(locus == as.character(i))
    sentinel <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(sentinel)
    chr <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(chr)
    pos <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(pos)
    print(sentinel)
    trait <- "SA"
    if(chr!="X"){
        chr <- as.integer(chr)
        pos <- as.integer(pos)
        pos1 <- pos-250000 # window set
        pos2 <- pos+250000 # window set
        if(pos1<0){
            pos1 <- 0
        }
        genes <- gene_loc[gene_loc$CHR==chr,]
        genes <- genes[(genes$START>=pos1&genes$START<=pos2)|(genes$END>=pos1&genes$END<=pos2),]
        n_genes <- nrow(genes)
        if(n_genes!=0){ 
            result1 <- data.frame(matrix(ncol = 16,nrow = 0))          
            for(j in 1:n_genes){
                result2 <- data.frame(matrix(ncol = 16,nrow = 0))
                gene <- genes$ENSGID[j]
                result2[1,1] <- gene
                result2[1,2] <- sentinel
                PoPS <- fread(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/",trait,".",chr,".results"))
                score <- PoPS[PoPS$ENSGID==gene,]$Score[1]
                result2[1,3] <- score
                result2[1,4] <- i
                beta <- fread(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/",trait,".",chr,".coefs"))
                X_all <- gene_features[ENSGID==gene,]
                X <- select(X_all,beta$Feature)
                X <- as.data.table(t(X))
                beta$X <- X$V1
                beta <- beta[,contribute:=X*beta_hat]
                beta <- beta[order(abs(contribute),decreasing=TRUE)]
                result2[1,7] <- beta$Feature[1]
                result2[1,8] <- beta$Feature[2]
                result2[1,9] <- beta$Feature[3]
                result2[1,10] <- beta$Feature[4]
                result2[1,11] <- beta$Feature[5]
                result2[1,12] <- beta$Feature[6]
                result2[1,13] <- beta$Feature[7]
                result2[1,14] <- beta$Feature[8]
                result2[1,15] <- beta$Feature[9]
                result2[1,16] <- beta$Feature[10]
                result1 <<- rbind(result1,result2)      
            }
            result1 <- as.data.table(result1)
            result1 <- result1[order(X3,decreasing=TRUE)]
            result1$X6 <- FALSE
            result1$X6[1] <- TRUE
            for(j in 1:n_genes){
                result1$X5[j] <- j
            }
            result <<- rbind(result,result1)
        }
    }
}
names(result) <- c("ENSGID","sentinel","PoPS_score","signal_id","gene_rank","prioritized","Feature1","Feature2","Feature3","Feature4","Feature5","Feature6","Feature7","Feature8","Feature9","Feature10")
result <- result[order(signal_id,gene_rank)]
write.table(result,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_250kb_window.txt", row.names=F, quote=F, sep="\t")
#save results for only prioritised genes:
write.table(result[gene_rank==1,],"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/prioritized_genes_250kb_window.txt", row.names=F, quote=F, sep="\t")
#save prioritised genes only:
genes_250 <- unique(result %>% filter(gene_rank == 1) %>% select(ENSGID))
write.table(genes_250,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/pops_var2genes_raw_250kbwindow.txt", row.names=F, quote=F, sep="\t", col.names=F)
#two locus not present when looking at 500kb total window:
#10_rs201499805_rs1444789_8542744_9564361 and 15_rs11071559_60569988_61569988

##### sentinel 1 mb total window #####
result <- data.frame(matrix(ncol = 16,nrow = 0))
for(i in locus){
    locus_sig_list <- sig_list %>% filter(locus == as.character(i))
    sentinel <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(sentinel)
    chr <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(chr)
    pos <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(pos)
    print(sentinel)
    trait <- "SA"
    if(chr!="X"){
        chr <- as.integer(chr)
        pos <- as.integer(pos)
        pos1 <- pos-500000 # window set
        pos2 <- pos+500000 # window set
        if(pos1<0){
            pos1 <- 0
        }
        genes <- gene_loc[gene_loc$CHR==chr,]
        genes <- genes[(genes$START>=pos1&genes$START<=pos2)|(genes$END>=pos1&genes$END<=pos2),]
        n_genes <- nrow(genes)
        if(n_genes!=0){ 
            result1 <- data.frame(matrix(ncol = 16,nrow = 0))          
            for(j in 1:n_genes){
                result2 <- data.frame(matrix(ncol = 16,nrow = 0))
                gene <- genes$ENSGID[j]
                result2[1,1] <- gene
                result2[1,2] <- sentinel
                PoPS <- fread(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/",trait,".",chr,".results"))
                score <- PoPS[PoPS$ENSGID==gene,]$Score[1]
                result2[1,3] <- score
                result2[1,4] <- i
                beta <- fread(paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/",trait,".",chr,".coefs"))
                X_all <- gene_features[ENSGID==gene,]
                X <- select(X_all,beta$Feature)
                X <- as.data.table(t(X))
                beta$X <- X$V1
                beta <- beta[,contribute:=X*beta_hat]
                beta <- beta[order(abs(contribute),decreasing=TRUE)]
                result2[1,7] <- beta$Feature[1]
                result2[1,8] <- beta$Feature[2]
                result2[1,9] <- beta$Feature[3]
                result2[1,10] <- beta$Feature[4]
                result2[1,11] <- beta$Feature[5]
                result2[1,12] <- beta$Feature[6]
                result2[1,13] <- beta$Feature[7]
                result2[1,14] <- beta$Feature[8]
                result2[1,15] <- beta$Feature[9]
                result2[1,16] <- beta$Feature[10]
                result1 <<- rbind(result1,result2)      
            }
            result1 <- as.data.table(result1)
            result1 <- result1[order(X3,decreasing=TRUE)]
            result1$X6 <- FALSE
            result1$X6[1] <- TRUE
            for(j in 1:n_genes){
                result1$X5[j] <- j
            }
            result <<- rbind(result,result1)
        }
    }
}
names(result) <- c("ENSGID","sentinel","PoPS_score","signal_id","gene_rank","prioritized","Feature1","Feature2","Feature3","Feature4","Feature5","Feature6","Feature7","Feature8","Feature9","Feature10")
result <- result[order(signal_id,gene_rank)]
write.table(result,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_500kb_window.txt", row.names=F, quote=F, sep="\t")
write.table(result[gene_rank==1,],"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/prioritized_genes_500kb_window.txt", row.names=F, quote=F, sep="\t")
#save prioritised genes only:
genes_500 <- unique(result %>% filter(gene_rank == 1) %>% select(ENSGID))
write.table(genes_500,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/pops_var2genes_raw_500kbwindow.txt", row.names=F, quote=F, sep="\t", col.names=F)
#locus 15_rs11071559_60569988_61569988 reveals a gene !

################# mapping features ###############
# Get gene information using biomaRt for all genes
library(biomaRt)         # Requires R/4.1.0
#250Kb window results
pops <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/prioritized_genes_250kb_window.txt")
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl", version = "GRCh37")
genes <- as_tibble(getBM(attributes=c('hgnc_symbol', 'ensembl_gene_id', 'strand'),
  filters=c('ensembl_gene_id'),
  values=list(sort(unique(pops$ENSGID))),
  mart=ensembl)) %>%
  rename(gene_symbol = hgnc_symbol, gene_strand = strand)

genes <- as.data.table(genes)
setnames(genes,"ensembl_gene_id","ENSGID")
merged <- merge(pops,genes,by="ENSGID")
write.table(merged,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_250kb_window_gene_mapped.txt", row.names=F, quote=F, sep="\t")

#500Kb window results
pops <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/prioritized_genes_500kb_window.txt")
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl", version = "GRCh37")
genes <- as_tibble(getBM(attributes=c('hgnc_symbol', 'ensembl_gene_id', 'strand'),
  filters=c('ensembl_gene_id'),
  values=list(sort(unique(pops$ENSGID))),
  mart=ensembl)) %>%
  rename(gene_symbol = hgnc_symbol, gene_strand = strand)

genes <- as.data.table(genes)
setnames(genes,"ensembl_gene_id","ENSGID")
merged <- merge(pops,genes,by="ENSGID")
write.table(merged,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_500kb_window_gene_mapped.txt", row.names=F, quote=F, sep="\t")

detach("package:biomaRt", unload=TRUE)

##########################################################
w250 <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_250kb_window_gene_mapped.txt") %>%
        mutate(window="250kb")
w500 <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_result_summary_500kb_window_gene_mapped.txt") %>%
        mutate(window="500kb")
w500_use <- w500[!sentinel%in%w250$sentinel,]
merged <- rbind(w250,w500_use)
merged <- merged[PoPS_score>0,]

write.table(merged,"/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_results_merged_table.txt", row.names=F, quote=F, sep="\t")

#save final list of genes:
genes <- unique(merged %>% select(gene_symbol))
write.table(genes,"/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/pops_var2genes_raw.txt", row.names=F, quote=F, sep="\t")

PoPS is a gene prioritisation analysis that uses genome-wide signals from GWAS summary statistics and identified prioritised genes looking into public bulk and single-cell expression datasets, curated biological pathways, and predicted protein-protein interactions (https://github.com/FinucaneLab/pops; https://www.nature.com/articles/s41588-023-01443-6).
PoPS includes three steps:

  • Step 0: Generate MAGMA scores: run MAGMA to obtain gene association statistics (z-scores) using severe asthma GWAS sumstats and 1000G European individuals;

  • Step 1: Select features: looking at the MAGMA scores, certain features are selected since they are enriched, and stored in the .feature output files;

  • Step 2: Run PoPS: calculate the POP score for each gene using a joint model with a leave-one chromosome out method for the enrichment of all selected features

For each variant, I analysed all the genes within 500Kb and reported the top prioritised genes as by PoPS; if no prioritised genes, I enlarged the genomic window to 1MB.

Nearby Mendelian rare disease-genes

src/rare_disease/rare_disease.r

#!/usr/bin/env Rscript

#Rationale: Find nearby rare mendelian genes that are within +/- 500Kb from SA GWAS sentinels.

library(GenomicRanges)
library(rtracklayer)
library(tidyverse)
library(pander)
library(scales)
library(readxl)
library(data.table)

# Define distance to look up nearby rare disease associated genes
DISTANCE <- 5e5
# funtion to work out the number unique element of the yth column of data table x
getN <- function(x, y) x %>% pull({{y}}) %>% n_distinct

## retrieve genomic reference for genes
ucsc = browserSession("UCSC")  # methods for getting browser sessions
genome(ucsc) <- "hg19"  # setting the sequence information stored in ucsc
# genomic reference for genes
refseq <- ucsc %>%
  ucscTableQuery(track="NCBI RefSeq", table="refGene") %>%
  getTable %>%
  as_tibble

## genes and rare disease association from orphadata 
#https://www.orphadata.com/data/xml/en_product6.xml
orphanet_genes <- read_xlsx("/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/en_product6.xlsx") %>%
  select(Symbol, Gene=Name12, OrphaCode, Disease=Name, SourceOfValidation,
         DisorderGeneAssociationType=Name24) %>%
  distinct

# hpo: human phenotype ontology provides a standardized vocabulary of phenotypic abnormalities encountered in human disease
#https://www.orphadata.org/data/xml/en_product4.xml
#create the levels for HPOFrequency column:
hpo_f_levels <- c("Obligate (100%)","Very frequent (99-80%)","Frequent (79-30%)","Occasional (29-5%)","Very rare (<4-1%)","Excluded (0%)")
orphanet_hpo <- read_xlsx("/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/en_product4.xlsx") %>%
  select(OrphaCode, HPOId, HPOTerm, HPOFrequency=Name15) %>%
  distinct %>%
  mutate(HPOFrequency=factor(HPOFrequency, levels=hpo_f_levels))

##Retrieve Sentinel SNPs - highest PIP in the fine-mapped loci:
sig_list_tmp <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt")
setnames(sig_list_tmp,"chromosome","chr")
setnames(sig_list_tmp,"position","pos")
sig_list_tmp$sentinel <- paste0(sig_list_tmp$chr,"_",sig_list_tmp$pos,"_",sig_list_tmp$allele1,"_",sig_list_tmp$allele2)
locus <- unique(sig_list_tmp$locus)

sentinels <- data.frame(matrix(ncol = 4,nrow = 0))
colnames(sentinels) <- c("locus","sentinel","chr","pos")
for(i in locus){
    locus_sig_list <- sig_list_tmp %>% filter(locus == as.character(i))
    locus_sig_list <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(locus,sentinel,chr,pos)
    sentinels <- rbind(sentinels,locus_sig_list)
    }
fwrite(sentinels, "input/highest_PIP_sentinels",row.names=F,quote=F,sep="\t") #save this for easiness to retrieve them.

## Overlap of GWAS function signals with NCBI Refseq genes
# Create a GRanges object for reference of genes
refseq.GRanges <- refseq %>%
  mutate(chrom=sub("^chr", "", chrom)) %>%
  select(Symbol=name2, chrom, start=txStart, end=txEnd) %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE) # takes a data-frame-like object as input and tries to automatically find the columns that describe genomic ranges. It returns them as a GRanges object 
# Create a GRanges object for our signals
sentinels.GRanges <- sentinels %>%
  select(chr, start=pos, sentinel) %>%
  makeGRangesFromDataFrame(end.field = "start", keep.extra.columns = TRUE)
# overlap checking
nearby_genes <- mergeByOverlaps(sentinels.GRanges, refseq.GRanges, maxgap=DISTANCE) %>%
  as_tibble %>%
  select(sentinel, Position=sentinels.GRanges.start, Symbol,
         txStart=refseq.GRanges.start, txEnd=refseq.GRanges.end,
         width=refseq.GRanges.width) %>%
  mutate(distance=ifelse(txStart <= Position & txEnd >= Position, 0,
                         ifelse(Position < txStart, txStart - Position, Position - txEnd))) %>%
  group_by(sentinel, Symbol) %>%   # most data operation are done one groups defined by variables
  filter(distance == min(distance)) %>%
  filter(width == max(width)) %>%
  slice(1) %>%
  ungroup

n_refseq <- refseq %>% getN(name2)
n_nearby_genes_Sentinel <- nearby_genes %>% getN(sentinel)
n_nearby_genes_Symbol <- nearby_genes %>% getN(Symbol)
n_genes <- orphanet_genes %>% getN(Symbol)
n_disease <- orphanet_genes %>% getN(OrphaCode)
n_disease_with_hpo <- orphanet_hpo %>% getN(OrphaCode)
n_hpo <- orphanet_hpo %>% getN(HPOId)

# Merge Orphanet genes with HPO terms and keep only Disease-causing germline genes
orphanet <- left_join(orphanet_genes, orphanet_hpo) %>%
  filter(grepl("Disease-causing germline", DisorderGeneAssociationType)) %>%
  mutate(HPOFrequency = fct_explicit_na(HPOFrequency, "No HPO term"))

n_gene_disease_c <- orphanet %>% group_by(Symbol, Disease) %>% n_groups 
n_genes_c <- orphanet %>% getN(Symbol)
n_disease_c <- orphanet %>% getN(Disease)
n_disease_hpo_c <- orphanet %>% group_by(Disease, HPOId) %>% n_groups 
n_disease_with_hpo_c <- orphanet %>% filter(!is.na(HPOId)) %>% getN(Disease)
n_hpo_c <- orphanet %>% getN(HPOId)

orphanet %>%
  count(HPOFrequency) %>%
  mutate(pc=(n/sum(n)) %>% percent) %>%
  pander(big.mark=",", caption=paste("Table 1a: Distribution of HPO term frequencies across",
  nrow(orphanet) %>% comma, "rows of gene-disease-hpo associations for Disease-causing germline genes"))

orphanet %>%
  group_by(Symbol) %>%
  arrange(HPOFrequency) %>%
  slice(1) %>%
  ungroup %>%
  count(HPOFrequency) %>%
  mutate(pc=(n/sum(n)) %>% percent) %>%
  pander(big.mark=",", caption=paste("Table 1b: Distribution of highest HPO term frequency per gene across",
                                     n_genes_c %>% comma, "Disease-causing germline genes"))

## Filtering for asthma terms
red <- hpo_f_levels[1:2]
yellow <- hpo_f_levels[3]
green <- hpo_f_levels[4:5]

#HPOId term for asthma: HP:0002099
key_terms <- c("asthma","eosin","immunodef","cili","autoimm","leukopenia","neutropenia","macroph")
orphanet <- orphanet %>%
  mutate(asthma_HPO=grepl(paste(key_terms, collapse='|'), HPOTerm, ignore.case = TRUE) &
                   HPOFrequency != "Excluded (0%)",
         asthma_disease=grepl(paste(key_terms, collapse='|'), Disease, ignore.case = TRUE),
         asthma=(asthma_HPO | asthma_disease),
         overlap=Symbol %in% nearby_genes$Symbol,
         evidence=ifelse(!asthma, "Not asthma related",
                         ifelse(asthma_disease | HPOFrequency %in% red,
                         "Disease name, obligate or very frequent",
                         ifelse(HPOFrequency %in% yellow, "Frequent",
                                ifelse(HPOFrequency %in% green, "Occasional or rare", "Excluded")))) %>%
           factor(levels = c("Disease name, obligate or very frequent",
                             "Frequent",
                             "Occasional or rare",
                             "Excluded",
                             "Not asthma related")))

n_asthma_genes_disease <- orphanet %>% filter(asthma_disease) %>% getN(Symbol)
n_asthma_genes_hpo <- orphanet %>% filter(asthma_HPO) %>% getN(Symbol)

orphanet_asthma <- orphanet %>%
  filter(asthma)

n_asthma_genes <- orphanet_asthma %>% getN(Symbol)

orphanet_asthma_selected <- orphanet_asthma %>%
  filter(overlap) %>%
  select(-overlap)

n_asthma_genes_selected <- orphanet_asthma_selected %>% getN(Symbol)

table2a <- orphanet %>%
  group_by(Symbol) %>%
  summarise_at(vars(asthma, overlap), any) %>%
  ungroup %>%
  count(asthma, overlap) %>%
  pivot_wider(names_from = "overlap", values_from = "n", names_prefix = "overlap_") %>%
  mutate_at(vars(starts_with("overlap")), list(pc=~(./sum(.)) %>% percent))

table2a_chisq <- table2a %>% select(2:3) %>% chisq.test

table2b <- orphanet %>%
  group_by(Symbol) %>%
  arrange(evidence) %>%
  slice(1) %>%
  ungroup %>%
  count(evidence, overlap) %>%
  pivot_wider(names_from = "overlap", values_from = "n", names_prefix = "overlap_",
              values_fill = list(n=0)) %>%
  mutate_at(vars(starts_with("overlap")), list(pc=~(./sum(.)) %>% percent))

table2b_chisq <- table2b %>% select(2:3) %>% chisq.test

pander(table2a, big.mark=",",
       caption=paste0("Table 2a: For", n_genes_c %>% comma, "rare Disease-causing, germline genes, the numbers of asthma and non-asthma related genes vs. numbers overlapping severe asthma signals ", DISTANCE/1000, "kb. chi^2 P =", table2a_chisq$p.value %>% round(3)))

pander(table2b, big.mark=",",
       caption=paste0("Table 2b: For", n_genes_c %>% comma, "rare Disease-causing, germline genes, the numbers of genes in each asthma evidence category vs. numbers overlapping severe asthma signals ", DISTANCE/1000, "kb. chi^2 P =", table2b_chisq$p.value %>% round(3)), split.table=Inf)

## Hypergeometric test
hitInSample <- n_asthma_genes_selected
hitInPop <- n_asthma_genes
failInPop <- n_refseq - n_asthma_genes
sampleSize <- n_nearby_genes_Symbol

p_asthma_en <- phyper(q=hitInSample - 1,
                 m=hitInPop,
                 n=failInPop,
                 k=sampleSize,
                 lower.tail = FALSE)  # P value for hypergeometric test: 0.2486454

table3a.m <- matrix(c(hitInSample, hitInPop - hitInSample,
                      sampleSize - hitInSample, failInPop - sampleSize + hitInSample), 2, 2)
rownames(table3a.m) <- c("yes", "no")
colnames(table3a.m) <- c("asthma", "others")
table3a <- table3a.m %>%
  as_tibble(rownames = "Near severe asthma sentinel") %>%
  mutate(pc_asthma=(asthma/(asthma + others)) %>% percent)
pander(table3a, big.mark=",", caption=paste0("Table 3: For rare disease-causing genes, the proportion that are asthma related within ", DISTANCE/1000, "kb of a severe asthma sentinel."))

## By-gene and by-SNP results
concat <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) > 0 & is.numeric(x)) {
    min(x, na.rm=T) %>% paste
  } else {
    x %>% unique %>% paste(collapse="; ")
  }
}

orphanet_asthma_selected_wide <- orphanet_asthma_selected %>%
  mutate_at(vars(starts_with("HPO")), ~ifelse(.data$asthma_HPO, ., NA)) %>%
  arrange(evidence) %>%
  group_by(Symbol, Gene) %>%
  summarise(Evidence=evidence[1],
            nDisease=n_distinct(Disease, na.rm = TRUE),
            Diseases=concat(Disease),
            Validation=concat(SourceOfValidation),
            nHPOTerm=n_distinct(HPOTerm, na.rm = TRUE),
            HPOTerms=concat(HPOTerm)) %>%
  ungroup %>%
  arrange(Evidence)
  implicated_rare <- right_join(nearby_genes, orphanet_asthma_selected_wide) %>%
  add_column(Rare="Rare", .before = "txStart")

results <- implicated_rare %>% select(-Position) %>% left_join(sentinels, .)

results_rare <- results %>% filter(!is.na(Rare)) %>% arrange(Evidence, distance, chr, pos) %>% ungroup

results_by_snp <- results %>%
  arrange(Evidence, distance) %>%
  mutate_at(vars(Rare), ~ifelse(!is.na(.), .data$Symbol, NA)) %>%
  group_by(sentinel) %>%
  summarise_at(vars(Rare, Evidence, distance), concat) %>%
  ungroup %>% 
  left_join(sentinels, .)

results_by_snp_phyper <- results_rare %>%
  group_by(sentinel) %>%
  summarise(hitInSample_snp=n()) %>%
  ungroup %>%
  left_join(nearby_genes %>%
              group_by(sentinel) %>%
              summarise(sampleSize_snp=n())) %>%
  mutate(hyperG_p=phyper(q=hitInSample_snp - 1,
                 m=hitInPop,
                 n=failInPop,
                 k=sampleSize_snp,
                 lower.tail = FALSE)) %>%
  left_join(results_by_snp, .)

results_by_gene <- results %>%
  filter(!is.na(Symbol)) %>%
  arrange(Evidence, distance) %>%
  mutate_at(vars(Rare), ~ifelse(!is.na(.), .data$sentinel, NA)) %>%
  group_by(Symbol) %>%
  summarise_at(vars(Rare, Evidence, distance, Diseases, HPOTerms), concat)

out_base <- paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/rare_disease/results_", DISTANCE/1000, "kb")
write_csv(results, paste0(out_base, ".csv"), na = "")
write_csv(results_rare, paste0(out_base, "_rare.csv"), na = "")
write_csv(results_by_snp_phyper, paste0(out_base, "_by_snp.csv"), na = "")
write_csv(results_by_gene, paste0(out_base, "_by_gene.csv"), na = "")
#save genes only:
fwrite(as.data.frame(results_by_gene$Symbol), "/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/rare_disease_genes_raw.txt", na = "",col.names=F,quote=F)

Using the ORPHANET resource (https://www.orpha.net/), I analysed if any genes within 500Kb the top causal variant for each fine mapped locus was associated with rare diseases showing implication in asthma. In doing so, I filtered the Human Phenotype Ontology (HPO) term or any disease including key terms ‘asthma’/‘eosin’/‘immunodef’/‘cili’/‘autoimm’/‘leukopenia’/‘neutropenia’/‘macroph’.

Nearby Mouse knockout orthologs

src/mouse_ko/mouse_ko.r

#!/usr/bin/env Rscript

#Rationale: Find nearby human ortholog of mouse knockout genes that are within +/- 500Kb from SA GWAS sentinels.

library(GenomicRanges)
library(rtracklayer)
library(tidyverse)
library(pander)
library(scales)
library(readxl)
library(data.table)

setwd("/scratch/gen1/nnp5/Var_to_Gen_tmp/mouse_ko/")

DISTANCE <- 5e5

getN <- function(x, y) x %>% pull({{y}}) %>% n_distinct


##Retrieve Sentinel SNPs - highest PIP in the fine-mapped loci:
sig_list_tmp <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt")
setnames(sig_list_tmp,"chromosome","chr")
setnames(sig_list_tmp,"position","pos")
sig_list_tmp$sentinel <- paste0(sig_list_tmp$chr,"_",sig_list_tmp$pos,"_",sig_list_tmp$allele1,"_",sig_list_tmp$allele2)
locus <- unique(sig_list_tmp$locus)

sentinels <- data.frame(matrix(ncol = 4,nrow = 0))
colnames(sentinels) <- c("locus","sentinel","chr","pos")
for(i in locus){
    locus_sig_list <- sig_list_tmp %>% filter(locus == as.character(i))
    locus_sig_list <- locus_sig_list %>% filter(PIP_average == max(locus_sig_list$PIP_average)) %>% select(locus,sentinel,chr,pos)
    sentinels <- rbind(sentinels,locus_sig_list)
    }

## retrieve genomic reference for genes
#resolved problem:
#if (!require("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install(version = "3.18")

ucsc <- browserSession("UCSC")
genome(ucsc) <- "hg19"
refseq <- ucsc %>%
  ucscTableQuery(track="NCBI RefSeq", table="refGene") %>%
  getTable %>%
  as_tibble

## Mouse genotype-phenotype data
## Store in /scratch/gen1/nnp5/Var_to_Gen_tmp/mouse_ko/ and downloaded from:
#latest release 2023-07-06:
#wget -P ${tmp_path}/mouse_ko/ https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/latest/results/genotype-phenotype-assertions-ALL.csv.gz
#latest release 2023-11-22:
#wget -P ${tmp_path}/mouse_ko/ http://ftp.ebi.ac.uk/pub/databases/genenames/hcop/human_mouse_hcop_fifteen_column.txt.gz

orthologs <- read_tsv("human_mouse_hcop_fifteen_column.txt.gz")
all_geno_pheno <- read_csv("genotype-phenotype-assertions-ALL.csv.gz")

geno_pheno <- all_geno_pheno %>%
  separate(top_level_mp_term_name, c("top_level_mp_term", "top_level_mp_subtype"), ",")

#display all the possible top level mp term and related number of genes:
geno_pheno %>%
  group_by(top_level_mp_term) %>%
  summarise(nlines=n(), ngenes=n_distinct(marker_symbol)) %>%
  drop_na %>%
  pander(big.mark=",")


## Overlap of severe asthma - SA signals with NCBI Refseq genes
refseq.GRanges <- refseq %>%
  select(Symbol=name2, chrom, start=txStart, end=txEnd) %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

SA.GRanges <- sentinels %>%
  mutate(Chrom=paste0("chr", chr)) %>%
  select(Chrom, start=pos, end=pos, sentinel) %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

nearby_genes <- mergeByOverlaps(SA.GRanges, refseq.GRanges, maxgap=DISTANCE) %>%
  as_tibble %>%
  select(sentinel, Position=SA.GRanges.start, Symbol,
         txStart=refseq.GRanges.start, txEnd=refseq.GRanges.end,
         width=refseq.GRanges.width) %>%
  mutate(distance=ifelse(txStart <= Position & txEnd >= Position, 0,
                         ifelse(Position < txStart, txStart - Position, Position - txEnd))) %>%
  group_by(sentinel, Symbol) %>%
  filter(distance == min(distance)) %>%
  filter(width == max(width)) %>%
  slice(1) %>%
  ungroup

n_refseq <- refseq %>% getN(name2)
n_nearby_genes_sentinel <- nearby_genes %>% getN(sentinel)
n_nearby_genes_Symbol <- nearby_genes %>% getN(Symbol)

## Orthologs of mouse KO genes with asthma relevant phenotype - BROAD filter
#BROAD filter for top_level_mp_term == "respiratory system phenotype" give immunity and/or muscle related term as well:
ko_mouse_mp <- geno_pheno %>%
  filter(top_level_mp_term == "respiratory system phenotype" |
         top_level_mp_term == "immune system phenotype" |
         top_level_mp_term == "muscle phenotype") %>%
  select(top_level_mp_term, mp_term_name) %>%
  distinct %>% arrange(top_level_mp_term,mp_term_name)
#save the top level and suptypes level that I use:
fwrite(ko_mouse_mp,"/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/MP_TERM_approx_asthma_respimmmuscle.txt",quote=F, sep="\t")

#Actual filter for relevant top level and subtypes
ko_mouse <- geno_pheno %>%
  filter(top_level_mp_term == "respiratory system phenotype" |
         top_level_mp_term == "immune system phenotype" |
         top_level_mp_term == "muscle phenotype") %>%
  select(marker_symbol, mp_term_name) %>%
  distinct

ko_human <- orthologs %>%
  filter(support != "OrthoMCL") %>%
  select(mouse_symbol, human_symbol) %>%
  inner_join(ko_mouse, c("mouse_symbol" = "marker_symbol")) %>%
  mutate(overlap=human_symbol %in% nearby_genes$Symbol)

no_ortholog <- setdiff(ko_mouse$marker_symbol, ko_human$mouse_symbol)
ko_human_selected <- ko_human %>%
  filter(overlap)

## Hypergeometric test
hitInSample <- nrow(ko_human_selected)
hitInPop <- nrow(ko_human)
failInPop <- n_refseq - hitInPop
sampleSize <- n_nearby_genes_Symbol

p_resp_en <- phyper(q=hitInSample - 1,
                 m=hitInPop,
                 n=failInPop,
                 k=sampleSize,
                 lower.tail = FALSE)

table3a.m <- matrix(c(hitInSample, hitInPop - hitInSample,
                      sampleSize - hitInSample, failInPop - sampleSize + hitInSample), 2, 2)
rownames(table3a.m) <- c("yes", "no")
colnames(table3a.m) <- c("asthma", "others")
table3a <- table3a.m %>%
  as_tibble(rownames = "Near severe asthma sentinel") %>%
  mutate(pc_asthma=(asthma/(asthma + others)) %>% percent)

pander(table3a, big.mark=",", caption=paste0("Table 3: For mouse knockout-causing genes, the proportion that are asthma related within ", DISTANCE/1000, "kb of an asthma function sentinel."))

### By-gene and by-SNP results
concat <- function(x) x[!is.na(x)] %>% unique %>% paste(collapse="; ")

implicated_mko <- right_join(nearby_genes, ko_human_selected, c("Symbol" = "human_symbol")) %>%
  add_column(MKO="MKO", .before = "txStart")

results <- left_join(sentinels, implicated_mko)

results_mko <- results %>%
  filter(!is.na(MKO)) %>%
  arrange(distance, chr, pos) %>%
  ungroup 

results_by_snp <- results %>%
  arrange(distance) %>%
  mutate_at(vars(MKO), ~ifelse(!is.na(.), .data$Symbol, NA)) %>%
  group_by(sentinel) %>%
  summarise_at(vars(MKO, distance), concat) %>%
  ungroup %>% 
  left_join(sentinels, .) 

results_by_snp_phyper <- results_mko %>%
  group_by(sentinel) %>%
  summarise(hitInSample_snp=n()) %>%
  ungroup %>%
  left_join(nearby_genes %>%
              group_by(sentinel) %>%
              summarise(sampleSize_snp=n())) %>%
  mutate(hyperG_p=phyper(q=hitInSample_snp - 1,
                 m=hitInPop,
                 n=failInPop,
                 k=sampleSize_snp,
                 lower.tail = FALSE)) %>%
  left_join(results_by_snp, .)

results_by_gene <- results %>%
  filter(!is.na(Symbol)) %>%
  arrange(distance) %>%
  mutate_at(vars(MKO), ~ifelse(!is.na(.), .data$sentinel, NA)) %>%
  group_by(Symbol) %>%
  summarise_at(vars(MKO, distance), concat) %>%
  ungroup 

results_by_snp_phyper %>% filter(!is.na(hyperG_p)) %>% count(hyperG_p < 0.05) %>%
  pander(caption="By SNP hypergeometric results")

out_base <- paste0("/scratch/gen1/nnp5/Var_to_Gen_tmp/mouse_ko/results_", DISTANCE/1000, "kb")
write_csv(results, paste0(out_base, ".csv"), na = "")
write_csv(results_mko, paste0(out_base, "_mko.csv"), na = "")
write_csv(results_by_snp_phyper, paste0(out_base, "_by_snp.csv"), na = "")
write_csv(results_by_gene, paste0(out_base, "_by_gene.csv"), na = "") # gives me the gene list
MP_TERM <- unique(ko_mouse_mp$mp_term_name)
inner_join(results_mko %>%
             select(-MKO, -txStart, -txEnd, -width, -overlap),
           results_by_snp_phyper %>% select(sentinel, hyperG_p)) %>%
  inner_join(all_geno_pheno %>%
               filter(mp_term_name %in% MP_TERM) %>%
               select(marker_symbol, mp_term_name, p_value, percentage_change, effect_size),
             c("mouse_symbol" = "marker_symbol")) %>%
  group_by(Symbol, mouse_symbol) %>%
  arrange(p_value) %>%
  slice(1) %>%
  write_csv("mouse_ko.csv")

#save genes only:
fwrite(as.data.frame(results_by_gene$Symbol), "/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/mouse_ko_genes_raw.txt", na = "",col.names=F,quote=F)

Using the International Mouse Phenotyping (website link), I analysed if any human ortholog gene has been studied in mouse model in the context of respiratory, immune or muscle phenotypes (top Mouse Phenotype (MP) terms ‘respiratory system phenotype’/‘immune system phenotype’/‘muscle phenotype’. I selected relevant genes within 500Kb from the top causal variant for each fine mapped locus.

Rare Variant Analysis

src/rare_variant/000_dataprep_rarevar.R

#!/usr/bin/env Rscript

#Rationale: data prep for rare-variant analysis in the RAP - phenotype-covariate file

library(tidyverse)
library(data.table)

bridge_file <- "/data/gen1/UKBiobank/application_88144/Bridge_eids_88144_56607.csv"
pheno_cov_file <- "/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/demo_EUR_pheno_cov_broadasthma.txt"

#match ID application 56607 to obtain phenotype-covariate file with ID application 88144:

bridge <- fread(bridge_file)
bridge$FID <- bridge$eid_88144
bridge$IID <- bridge$eid_88144
pheno_cov <- fread(pheno_cov_file) %>% select(-FID,-IID)
pheno_cov <- pheno_cov %>% rename("eid_56607" = eid)

pheno_cov_88144 <- left_join(pheno_cov,bridge,by="eid_56607") %>% relocate(IID, .before = missing) %>% relocate(FID, .before = IID)

write.table(pheno_cov_88144,"/scratch/gen1/nnp5/Var_to_Gen_tmp/rare_variant/demo_EUR_pheno_cov_broadasthma_app88144.txt",
row.names = FALSE, col.names = TRUE ,quote=FALSE, sep=" ", na = "NA")

src/rare_variant/README.txt

#Rationale: Micellaneous knowldege about dx toolkit, RAP and CLI to run the rare-variant single and gene-based collapsing analysis analysis step by step

#In REGENIE, phenotype and covariates column names:
file: /rfs/TobinGroup/data/UKBiobank/application_88144/demo_EUR_pheno_cov_broadasthma_app88144.txt
pheno="broad_pheno_1_5_ratio"
--covarColList age_at_recruitment,age2,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,genetic_sex

#Download the CLI:
https://documentation.dnanexus.com/downloads

#Tutorial videos at:
https://dnanexus.gitbook.io/uk-biobank-rap/working-on-the-research-analysis-platform/accessing-and-using-the-command-line-interface

#When indexing files on the RAP in the CLI: always put the project ID, it makes it run faster:
project-GGzFY70JBJzVx22v4Yj980J1:/Bulk/Genotype Results/CEL files/11/

#Upload the phenotype file on the platform through dx toolkit:
dx login
dx logout
dx upload /rfs/TobinGroup/data/UKBiobank/application_88144/demo_EUR_pheno_cov_broadasthma_app88144.txt
dx describe #description of the tool

src/rare_variant/submit_rare_variant.sh

#!/bin/bash

#Rationale: run Single-variant rare-variant ExWAS in the RAP from command-line interface CLI, using dx toolkit syntax
#Command-line interface (script)
#Script form Kath:
#What follows is my bash script for running a GWAS (using regenie) on the command-line interface.  I used the genotyping data for the first step and the exome data for the second.  For more information on the command-line interface and how to use it, I'd recommend the videos on this page: https://dnanexus.gitbook.io/uk-biobank-rap/working-on-the-research-analysis-platform/accessing-and-using-the-command-line-interface
#https://dnanexus.gitbook.io/uk-biobank-rap/getting-started/research-analysis-platform-training-webinars

cd /home/n/nnp5/PhD/PhD_project/Var_to_Gene/

#Start of script for regenie GWAS:
dx login
# You will need to enter your username and password at this stage, and select the number corresponding to your project from the list
dx select Severe_asthma
dx mkdir analysis
data_file_dir="/analysis/"

# Merging chromosomes together (genotyping data):
#The RAP folders names have space on that ! so need to use '\' to specify that:
# demo_EUR_pheno_cov_broadasthma_app88144.txt is the phenotype file I generated on ALICE3.
# running locally in the CLI, and then it gives a job that I am able to monitor on the RAP.
run_merge="cp /mnt/project/Bulk/Genotype\ Results/Genotype\ calls/ukb22418_c[1-9]* . ; ls *.bed | sed -e 's/.bed//g'> files_to_merge.txt; plink --merge-list files_to_merge.txt --make-bed --autosome-xy --out ukb22418_c1_22_v2_merged; rm files_to_merge.txt;"
dx run swiss-army-knife -iin="/demo_EUR_pheno_cov_broadasthma_app88144.txt" -icmd="${run_merge}" --tag="Merge" --instance-type "mem1_ssd1_v2_x16" --destination="${data_file_dir}" --brief --yes

# QC-ing the merged genotyping data:
awk {'print $1, $2'} /rfs/TobinGroup/data/UKBiobank/application_88144/demo_EUR_pheno_cov_broadasthma_app88144.txt | tail -n +2 > /scratch/gen1/nnp5/Var_to_Gen_tmp/rare_variant/ukb_eur_ids
dx cd analysis
dx upload ukb_eur_ids
run_plink_qc="plink2 --bfile ukb22418_c1_22_v2_merged --keep ukb_eur_ids --autosome --maf 0.01 --mac 20 --geno 0.1 --hwe 1e-15 --mind 0.1 --write-snplist --write-samples --no-id-header --out snp_qc_pass"
dx run swiss-army-knife -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.bed" -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.bim" -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.fam" -iin="${data_file_dir}/ukb_eur_ids" -icmd="${run_plink_qc}" --tag="plink_QC_eur_ukb" --instance-type "mem1_ssd1_v2_x16" --destination="${data_file_dir}" --brief --yes

#  Running regenie step 1:
##Using data that represents the whole genome, either imputed or whole genome seq:
run_regenie_step1="regenie --step 1 \
--lowmem \
--out sa_results \
--bed ukb22418_c1_22_v2_merged \
--phenoFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
--covarFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
--extract snp_qc_pass.snplist \
--phenoCol broad_pheno_1_5_ratio \
--covarCol age_at_recruitment,age2,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,genetic_sex \
--bsize 1000 \
--bt \
--loocv \
--gz \
--threads 16"
dx run swiss-army-knife -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.bed" -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.bim" -iin="${data_file_dir}/ukb22418_c1_22_v2_merged.fam" -iin="/demo_EUR_pheno_cov_broadasthma_app88144.txt" -iin="${data_file_dir}/snp_qc_pass.snplist" -icmd="${run_regenie_step1}" --tag="Step1" --instance-type "mem1_ssd1_v2_x16" --destination="${data_file_dir}" --brief --yes

# QC-ing the exome data:
exome_file_dir="/Bulk/Exome sequences/Population level exome OQFE variants, PLINK format - final release"
field_name="ukb23158"
for chr in {1..22}
do
  run_plink_wes="plink2 \
  --bfile ukb23158_c${chr}_b0_v1 \
  --no-psam-pheno \
  --keep ukb_eur_ids \
  --autosome \
  --mac 3 \
  --geno 0.1 \
  --hwe 1e-15 \
  --mind 0.1 \
  --write-snplist \
  --write-samples \
  --no-id-header \
  --out WES_c${chr}_snps_qc_pass"
  dx run swiss-army-knife \
  -iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.bed" \
  -iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.bim" \
  -iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.fam" \
  -iin="${data_file_dir}/ukb_eur_ids" \
  -icmd="${run_plink_wes}" \
  --tag="QC_ExWAS" \
  --instance-type "mem1_ssd1_v2_x16" \
  --name "QC_chr${chr}" \
  --destination="${data_file_dir}" \
  --brief \
  --yes
done

# Running regenie step 2:
exome_file_dir="/Bulk/Exome sequences/Population level exome OQFE variants, PLINK format - final release"
EXCLUDE="ukb23158_500k_OQFE.90pct10dp_qc_variants.txt"
field_name="ukb23158"
for chr in {1..22}
do
  run_regenie_cmd="regenie \
  --step 2 \
  --bed ukb23158_c${chr}_b0_v1 \
  --out ExWAS_SA_assoc.c${chr} \
  --phenoFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
  --covarFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
  --bt \
  --approx \
  --firth-se \
  --firth \
  --extract WES_c${chr}_snps_qc_pass.snplist \
  --exclude $EXCLUDE \
  --phenoCol broad_pheno_1_5_ratio \
  --covarCol age_at_recruitment,age2,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,genetic_sex \
  --bsize 400 \
  --pred sa_results_pred.list \
  --pThresh 0.05 \
  --minMAC 3 \
  --threads 16 \
  --gz"
  dx run swiss-army-knife \
  -iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.bed" \
  -iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.bim" \
  -iin="${exome_file_dir}/${field_name}_c${chr}_b0_v1.fam" \
  -iin="${data_file_dir}/WES_c${chr}_snps_qc_pass.snplist" \
  -iin="${exome_file_dir}/helper_files/${EXCLUDE}" \
  -iin="/demo_EUR_pheno_cov_broadasthma_app88144.txt" \
  -iin="${data_file_dir}/sa_results_pred.list" \
  -iin="${data_file_dir}/sa_results_1.loco.gz" \
  -icmd="${run_regenie_cmd}" \
  --tag="Step2" \
  --instance-type "mem1_ssd1_v2_x16" \
  --name="regenie_step2_chr${chr}" \
  --destination="${data_file_dir}" \
  --brief \
  --yes
done

#Downlaod a log file to have it locally on ALICE3:
#Var_to_Gene/input/regenie_step2_chr5:
#REGENIE v3.1.1.gz
#* case-control counts for each trait:
#'broad_pheno_1_5_ratio': 7413 cases and 36955 controls

# Merging output of regenie and formatting for visualising using LocusZoom:
merge_cmd='out_file="assoc.regenie.merged.txt"; cp /mnt/project/analysis/*.regenie.gz . ; gunzip *.regenie.gz ; echo -e "#CHROM\tGENPOS\tID\tALLELE0\tALLELE1\tA1FREQ\tN\tTEST\tBETA\tSE\tCHISQ\tLOG10P\tEXTRA" > $out_file ; files="./*.regenie"; for f in $files; do tail -n+2 $f | tr " " "\t" >> $out_file; done ; rm *.regenie'
dx run swiss-army-knife \
    -iin="${data_file_dir}/ExWAS_SA_assoc.c1_broad_pheno_1_5_ratio.regenie.gz" \
    -icmd="${merge_cmd}" \
    --tag="Merge_regenie_results" \
    --instance-type "mem1_ssd1_v2_x16" \
    --destination="${data_file_dir}" \
    --brief \
    --yes
done

#Run LocusZoom:
#On the RAP Platform:
#Tools > Tools Library > LocusZoom > Run
#Output folder: Severe_asthma/analysis/
#Input files: Severe_asthma/analysis/assoc.regenie.merged.txt"
#JobName: LocusZoom_single_rarevar_ExWAS
#and then WORKER URL to select the column:
#Build38
#Chromosome: CHROM
#Ref allele: ALLELE0
#P-value column: LOG10P tick 'is -log10(p)'
#Position: GENPOS
#Alt alelle: ALLELE1
#Effect size: BETA
#Std Err.: SE
#Affect allele: Alt
#Specify from: Frequency
#Frequency: A1FREQ
#Dowload summary statistics, log, screenshot of LocusZoom and QQPlot and upload in /Var_to_Gene/input/rare_variant/single_rarevar_ExWAS/

#Let's filter our data first
#MAF < 0.01 (Because for my common variants SA GWAS, I filter for MAF >= 0.01) and p-value <= 5E-6 (as suggestive threshold):

Rscript src/rare_variant/create_input_munge_summary_stats.R \
    input/rare_variant/single_rarevar_EwWAS/summary_stats.gz \
    "SA"

#Look-up of exonic rare variants (MAF < 0.01) within +/- 500Kb of credible set variants for each locus single variant and gene-based results.
#Genes of exonic rare variants in the regions with suggestive p-value <= 5E-6 were identified.
for line in {1..615}
do
chr=$(awk -v row="$line" ' NR == row {print $1 } ' input/cs_vars_liftover_output.bed | sed 's/chr//g')
pos=$(awk -v row="$line" 'NR == row {print $2 }' input/cs_vars_liftover_output.bed)
awk -v chr_idx=$chr -v pos_idx=$pos '$2 == chr_idx &&  $3 >= pos_idx-500000 && $3 <= pos_idx+500000 {print}' input/rare_variant/single_rarevar_EwWAS/SArarevar_suggestive
done

##no Look-up results

#Discovery analysis of significant associated variants:
#Significant threshold: 0.05/N-markers: 0.05/1772264
pval_thr=$(0.05/1772264)

Rscript src/rare_variant/sentinel_selection.R \
    input/rare_variant/single_rarevar_EwWAS/SArarevar_betase_input_mungestat \
    "SA"\
    500000 \
    0.05 \ #nominal threshold
    1772264 #N_markers for correction

src/rare_variant/create_input_munge_summary_stats.R

#!/usr/bin/env Rscriptcd all

#Rationale filter out rare variants form ExWAS.

library(data.table)
library(tidyverse)

args = commandArgs(trailingOnly=TRUE)
df_file <- args[1]
pheno <- as.character(args[2])

df <- fread(df_file,header=F,sep="\t")

#"#chrom"          "pos"             "rsid"            "ref"
#"alt"             "neg_log_pvalue"  "beta"            "stderr_beta"
#"alt_allele_freq"
#CHROM GENPOS ID ALLELE0 ALLELE1 A1FREQ INFO N TEST BETA SE CHISQ LOG10P
#REGENIE: reference allele (allele 0), alternative allele (allele 1)
#REGENIE:  estimated effect sizes (for allele 1 on the original scale)

colnames(df) <- c("CHROM", "GENPOS", "ID", "ALLELE0", "ALLELE1", "LOG10P", "BETA", "SE", "A1FREQ")
dfclean <- df %>% select(CHROM,ID,GENPOS,ALLELE1,ALLELE0,A1FREQ,LOG10P,BETA,SE)

dfclean$pval <- 10^(-as.numeric(dfclean$LOG10P))

#select column in order:
dfclean <- dfclean %>% select(ID,CHROM,GENPOS,ALLELE1,ALLELE0,BETA,SE,A1FREQ,pval)

#rename columns :
colnames(dfclean) <- c("snpid", "chr", "bp38", "a1", "a2", "LOG_ODDS", "se", "eaf", "pval")

#make sure eaf column is numeric:
dfclean$eaf <- as.numeric(dfclean$eaf)

#create MAF columns and filtered out for MAF >= 0.01:
dfclean <- dfclean %>% mutate(MAF = ifelse(dfclean$eaf <= 0.5, dfclean$eaf, 1 - dfclean$eaf))
dfclean_rarevar <- dfclean %>% filter(MAF < 0.01)

#save gwas results for variants with MAF < 0.01:
write.table(dfclean_rarevar,paste0("input/rare_variant/single_rarevar_EwWAS/",pheno,"rarevar_betase_input_mungestat"),quote=FALSE,sep="\t",row.names=FALSE)

#save gwas results for variants with MAF < 0.01 and pvalue <= 0.000005:
dfclean_rarevar_sugg <- dfclean_rarevar %>% filter(pval <= 0.000005)
write.table(dfclean_rarevar_sugg,paste0("input/rare_variant/single_rarevar_EwWAS/",pheno,"rarevar_suggestive"),quote=FALSE,sep="\t",row.names=FALSE)

src/rare_variant/submit_collapsing_bt.sh

#!/bin/bash
#Rationale: gene-based collapsing rare variant ExWAS - Based on Nick's script
#guidelines at:
#https://dnanexus.gitbook.io/uk-biobank-rap/science-corner/using-regenie-to-generate-variant-masks
#https://community.dnanexus.com/s/question/0D582000004zTItCAM/burden-test-from-regenie-in-ukb-wes-data-the-result-does-not-contain-alleles-or-beta-and-se
#https://github.com/rgcgithub/regenie/issues/327

dx login
dx select Severe_asthma


dx mkdir /analysis/collapsing_Hg38/
dx cd /analysis/collapsing_Hg38/
dx upload /scratch/gen1/nrgs1/rare_variant/collapsing_Hg38/ukb23158_500k_OQFE.annotations.txt.gz
dx upload /scratch/gen1/nrgs1/rare_variant/collapsing_Hg38/ukb23158_500k_OQFE.sets.txt.gz

#Create the backmask.def file:
printf "M1 LoF\nM3 LoF,missense(5/5)\nM4 LoF,missense(5/5),missense(>=1/5)" > /scratch/gen1/nnp5/Var_to_Gen_tmp/rare_variant/backmask.def
dx upload /scratch/gen1/nnp5/Var_to_Gen_tmp/rare_variant/backmask.def
dx cd ../..

SEED=/analysis/
BASE=/collapsing_Hg38/
EXOME_PATH="/Bulk/Exome sequences/Population level exome OQFE variants, PLINK format - final release"
ANNOT=ukb23158_500k_OQFE.annotations.txt.gz
SETLIST=ukb23158_500k_OQFE.sets.txt.gz
EXCLUDE=ukb23158_500k_OQFE.90pct10dp_qc_variants.txt
MASK=backmask.def
AAF_BINS=0.01,0.001,0.0001,0.00001
JOINT_TESTS=acat
MAXAFF=0.01
TESTS=acatv,skato
THREADS=16

if [ $# -gt 1 ]; then
    BUILD_MASK="--build-mask sum"
    OUT=${OUT}_sum
    NAME=${NAME}_sum
else
    BUILD_MASK="--write-mask --write-mask-snplist"
fi

for CHR in {1..22}
do
  NAME=sa_collapsing_chr${CHR}_gene_p
  BED=ukb23158_c${CHR}_b0_v1
  OUT=${BED}_sa_collapsing_backman_gene_p
  REGENIE_CMD="regenie \
    --bed $BED \
    --exclude $EXCLUDE \
    --extract WES_c${CHR}_snps_qc_pass.snplist \
    --step 2 \
    --pred sa_results_pred.list \
    --phenoFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
    --covarFile demo_EUR_pheno_cov_broadasthma_app88144.txt \
    --phenoCol broad_pheno_1_5_ratio \
    --covarCol age_at_recruitment,age2,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,genetic_sex \
    --bt \
    --anno-file ${ANNOT} \
    --set-list ${SETLIST} \
    --mask-def ${MASK} $BUILD_MASK \
    --firth --approx --pThresh 0.01 \
    --bsize 400 \
    --minMAC 3 \
    --aaf-bins $AAF_BINS \
    --joint $JOINT_TESTS \
    --vc-maxAAF $MAXAFF \
    --vc-tests $TESTS \
    --rgc-gene-p \
    --threads=$THREADS \
    --out $OUT"
  dx run swiss-army-knife \
    -iin="${EXOME_PATH}/${BED}.bed" \
    -iin="${EXOME_PATH}/${BED}.bim" \
    -iin="${EXOME_PATH}/${BED}.fam" \
    -iin="${SEED}/WES_c${CHR}_snps_qc_pass.snplist" \
    -iin="/demo_EUR_pheno_cov_broadasthma_app88144.txt" \
    -iin="${SEED}/sa_results_pred.list" \
    -iin="${SEED}/sa_results_1.loco.gz" \
    -iin="${SEED}/${BASE}/${ANNOT}" \
    -iin="${EXOME_PATH}/helper_files/${EXCLUDE}" \
    -iin="${SEED}/${BASE}/${SETLIST}" \
    -iin="${SEED}/${BASE}/${MASK}" \
    -icmd="$REGENIE_CMD" \
    --instance-type "mem1_ssd1_v2_x16" \
    --name="$NAME" \
    --destination="${SEED}/${BASE}" \
    --brief --yes --allow-ssh
done

#Downlaod files in ALICE3:
mkdir /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/genecollaps_ExWAS
dx download project-GGzFY70JBJzVx22v4Yj980J1:/analysis/collapsing_Hg38/ukb23158_c*

#Find gene p-value <= 5^10-6:
mkdir /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/rare_variant/genecollap_ExWAS
awk '$12 = 10^(-$12)' \
    /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/genecollaps_ExWAS/ukb23158_c*_b0_v1_sa_collapsing_backman_gene_p_broad_pheno_1_5_ratio.regenie | \
    grep -v "^#" | awk 'NR > 1 && $12 <= 0.000005 {print}' \
    > input/rare_variant/genecollap_ExWAS/SArarevar_genecollap_suggestive

##Look-up of exonic rare variants (MAF < 0.01) within +/- 500Kb of credible set variants for each locus single variant and gene-based results.
##Genes of exonic rare variants in the regions with suggestive p-value <= 5E-6 were identified.
for line in {1..615}
do
chr=$(awk -v row="$line" ' NR == row {print $1 } ' input/cs_vars_liftover_output.bed | sed 's/chr//g')
pos=$(awk -v row="$line" 'NR == row {print $2 }' input/cs_vars_liftover_output.bed)
awk -v chr_idx=$chr -v pos_idx=$pos '$1 == chr_idx &&  $2 >= pos_idx-500000 && $3 <= pos_idx+500000 {print}' input/rare_variant/genecollap_ExWAS/SArarevar_genecollap_suggestive
done

##no Look-up results

I obtained results for single variant and gene-based exome-wide association analysis including 7,413 cases and 36,955 controls, and covariates age_at_recruitment,age2,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,genetic_sex. ExWASes were performed using the tool REGENIE v3.1.1 and the Research Analysis Platform (RAP, https://ukbiobank.dnanexus.com/landing).
To identify rare variant suggestive association and mapped genes, I implemented a look-up analysis for exonic rare variants (MAF < 0.01) within +/- 500Kb from common credible set variants.

Variant-to-Locus-to-Gene

To combine the results from all the distinct analyses and see which genes were highlighted:
src/genes_heatmap.R

#!/usr/bin/env Rscript

#Rationale: visualisation of genes prioritised from all analyses

library(scales)
library(tidyverse)
library(data.table)
library(readxl)
library(RColorBrewer)


#input each analysis table:
genes_raw <- "input/var2genes_raw.xlsx"
nearest_gene <- read_excel(genes_raw, sheet = "nearest_genes",col_names = "gene")
nearest_gene$evidence <- "nearest"
annotation <- read_excel(genes_raw, sheet = "varannot_genes",col_names = "gene")
annotation$evidence <- as.factor("func_annot")
eqtl <- read_excel(genes_raw, sheet = "eQTL_genes_merge")
pqtl <- read_excel(genes_raw, sheet = "pQTL_genes_merge")
pops <- read_excel(genes_raw, sheet = "PoPS_genes",col_names = "gene")
pops$evidence <- as.factor("PoPS")
mouseko <- read_excel(genes_raw, sheet = "Mouseko_genes",col_names = "gene")
mouseko$evidence <- as.factor("mouse_KO")
raredis <- read_excel(genes_raw, sheet = "Raredisease_genes",col_names = "gene")
raredis$evidence <- as.factor("rare_disease")

#merge:
v2g_full <- rbind(nearest_gene,annotation,eqtl,pqtl,pops,mouseko,raredis)

#table with all the possible combination:
v2g_full_combination <- unique(expand.grid(x = v2g_full$gene, y = v2g_full$evidence, KEEP.OUT.ATTRS = TRUE)) %>% arrange(x)
colnames(v2g_full_combination) <- colnames(v2g_full)
#mask (conceptually: v2g_full_combination %in% v2g_full):
mask <- as.data.frame(do.call(paste0, v2g_full_combination) %in% do.call(paste0, v2g_full)) %>% rename(status='do.call(paste0, v2g_full_combination) %in% do.call(paste0, v2g_full)')
v2g_full_combination <- cbind(v2g_full_combination,mask) %>% mutate(status=as.integer(as.logical(status))) %>% arrange(status,gene)
#long to wide reshape using spread() in tidyr:
v2g_full_combination <- spread(v2g_full_combination, evidence, status) %>% remove_rownames %>% column_to_rownames(var="gene")

#  Convert row names into first column
v2g_full_combination <- setDT(v2g_full_combination, keep.rownames = TRUE)[]
setnames(v2g_full_combination, "rn", "gene")

# reshape your data
v2g_full_combination2 <- melt(v2g_full_combination, id.var = "gene")
length(unique(v2g_full_combination2$gene))
fwrite(v2g_full_combination2,"./output/v2g_gene_prioritisation.txt",sep="\t",quote=F)

# Plot
##use this to find which colour I want: fro the pie, I extract the index for the colour I want:
##everytime it changes, so I noted down the actual code of the colour: ##18: col[18] #9EBCDA
#n <- 30
#colrs <- brewer.pal.info[brewer.pal.info$colorblind == TRUE, ]
#col_vec = unlist(mapply(brewer.pal, colrs$maxcolors, rownames(colrs)))
#col <- sample(col_vec, n)
#area <- rep(1,n)
#pie(area, col = col)

#fnct for the plot:
fc_heatmap_all <- function(df,x_val,y_val,fill_val) {
      ggplot(df, aes(x=x_val, y=y_val, fill=fill_val)) +
      geom_tile() +
      scale_fill_gradient2(low = "white",
                       mid = "white",
                       high = "#9EBCDA") +
      scale_x_discrete(position = "top") +
      theme(axis.text = element_text(face = "bold"), axis.ticks.y=element_blank(), axis.ticks.x=element_blank()) +
      xlab("Evidence") + ylab("Gene") + theme(legend.position = "none") +
      ggtitle("Gene prioritization")
      }


#Split plot into 9 plots of 11 genes each:
test <- v2g_full_combination2 %>%
    arrange(gene, variable) %>%
    group_by() %>%
    mutate(facet=c(rep(9, ceiling(n()/9)),rep(8, ceiling(n()/9)),rep(7, ceiling(n()/9)),rep(6, ceiling(n()/9)),rep(5, ceiling(n()/9)),rep(4, ceiling(n()/9)),rep(3, ceiling(n()/9)),rep(2, ceiling(n()/9)), rep(1, floor(n()/9)))) %>%
    ungroup

png("./output/V2G_heatmap_subplots.png", width=1000, height = 900)
fc_heatmap_all(test,test$variable,test$gene,test$value) + facet_wrap(~facet, scales="free", ncol=3) +
        theme(strip.background = element_blank(), strip.text = element_blank(),
        axis.text.x=element_text(angle=75, vjust=0, hjust=0, face="bold"),
        axis.title=element_blank(),
        axis.text.y=element_text(hjust=1, face="italic"),
        axis.ticks = element_blank(),
        legend.title = element_blank())
dev.off()

To understand locus-gene(s) association:
src/Locus_to_genes_table.R

#!/usr/bin/env Rscript

#Rationale: create table with results from each individual analysis:
#evidence   signal  gene    locus   start   end chr pos trait   effect  other   eaf Z   P   Novel   width

library(tidyverse)
library(data.table)
library(writexl)
library(readxl)

#Cred_set variants:
#in credset, a1 is actually allele2 in gwas; a2 is actually allele1 in gwas
credset <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt") %>% select(-PIP_finemap,-PIP_susie) %>% rename(posb37=position,a1=allele2,a2=allele1,chr=chromosome)
#add posb38:
b38 <- fread("input/cs_vars_liftover_output.bed")
colnames(b38) <- c("chr_b38","posb38","pos1_b38")
credset_b38 <- cbind(credset,b38) %>% select(-chr_b38, -pos1_b38) %>%  relocate(posb38, .after = posb37)

#add gwas info to credset with also posb38:
gwas <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat") %>% rename(chr=b37chr,posb37=bp)
credset_gwas <- credset_b38 %>% left_join(gwas,by=c("chr","posb37","a1","a2","snpid"))
credset_gwas <- credset_gwas %>% arrange(chr,posb37)
credset_gwas$sentinel <- paste0(credset_gwas$chr,"_",credset_gwas$posb37,"_",credset_gwas$a2,"_",credset_gwas$a1)


##Nearest gene:
ng <- fread("output/PIP_sentinels_nearestgenes") %>% select(locus,sentinel,Nearest_gene)
ng$evidence <- as.factor("nearest_gene")
credset_gwas_ng <- credset_gwas %>% left_join(ng, by=c("sentinel","locus"))
##extract only variants with evidence for nearest gene - the sentinel ones cus I did nearest gene by locus:
credset_gwas_ng2 <- credset_gwas_ng %>% filter(!is.na(evidence))

##Functional annotation:
#fantom5
fantom5 <- fread("output/fnc_annot_fantom5") %>% rename(chr=Chromosome,posb38=Position)
fantom5$evidence <- as.factor("fantom5")
credset_gwas_fantom5 <- credset_gwas %>% left_join(fantom5, by=c("chr","posb38"))

#integrative scores:
inscores.aPCs <- fread("output/fnc_annot_inscores") %>% rename(chr=Chromosome,posb38=Position)
inscores.aPCs$evidence <- as.factor("inscores.aPCs")
credset_gwas_inscores.aPCs <- credset_gwas %>% left_join(inscores.aPCs, by=c("chr","posb38"))

#ClinVar:
clinvar <- fread("output/fnc_annot_clinvar") %>% rename(chr=Chromosome,posb38=Position)
clinvar$evidence <- as.factor("clinvar")
credset_gwas_clinvar <- credset_gwas %>% left_join(clinvar, by=c("chr","posb38"))

##Merge together the three functional criteria:
col_for_join <- c("locus","snpid","chr","posb37","posb38","a2","a1","PIP_average","LOG_ODDS","se","eaf","pval","MAF","sentinel","evidence")
fantom5_inscores <- full_join(credset_gwas_fantom5,credset_gwas_inscores.aPCs,by=col_for_join)
fantom5_inscores_clinvar <- full_join(fantom5_inscores,credset_gwas_clinvar,by=col_for_join) %>%
                            filter(!is.na(evidence)) %>%
                            unite(col='Gene', c('Nearest_gene', 'Genecode.Comprehensive.Info', 'Gene.Reported'),na.rm=TRUE)

##eQTL colocalisation:
#GTExV8:
gtex_coloc <- fread("output/coloc_asthma_GTEx.tsv") %>% select(snp, gene, tissue)
gtex_colocsusie <- fread("output/colocsusie_asthma_GTEx.tsv") %>% select(snp, gene, tissue)
gtex <- rbind(gtex_coloc, gtex_colocsusie)  %>% rename(sentinel_gtex=snp)
gtex$evidence <- as.factor("eqtl_gtex")
credset_gwas$sentinel_gtex <- paste0(credset_gwas$chr,"_",credset_gwas$posb37,"_",credset_gwas$a1,"_",credset_gwas$a2)
credset_gwas_gtex <- credset_gwas %>% left_join(gtex, by="sentinel_gtex")
credset_gwas_gtex2 <- credset_gwas_gtex %>% filter(!is.na(evidence)) %>% rename(gene_ensembl=gene)
#put gene symbol instead of ensembl:
gtex_genesymbol <- read_excel("input/var2genes_raw.xlsx", sheet="GTExV8_eQTL_genes_symbol",col_names=F) %>% rename(gene_ensembl="...1",gene="...2")
credset_gwas_gtex2 <- credset_gwas_gtex2 %>% left_join(gtex_genesymbol,by="gene_ensembl") %>% select(-gene_ensembl)


#eqtlGen:
eqtlgen<- fread("output/coloc_asthma_eqtlgen.tsv") %>% select(snp, gene, tissue) %>% rename(sentinel_eqtlgen=snp)
eqtlgen$evidence <- as.factor("eqtl_eqtlgen")
credset_gwas$sentinel_eqtlgen <- paste0(credset_gwas$chr,"_",credset_gwas$posb37,"_",credset_gwas$a1,"_",credset_gwas$a2)
credset_gwas_eqtlgen <- credset_gwas %>% left_join(eqtlgen, by="sentinel_eqtlgen")
credset_gwas_eqtlgen2 <- credset_gwas_eqtlgen %>% filter(!is.na(evidence))

#UBCLung:
ubclung <- fread("output/coloc_asthma_ubclung.tsv") %>% select(snp, gene_id, tissue) %>% rename(sentinel_ubclung=snp)
ubclung$evidence <- as.factor("eqtl_ubclung")
credset_gwas$sentinel_ubclung <- paste0(credset_gwas$chr,"_",credset_gwas$posb37,"_",credset_gwas$a1,"_",credset_gwas$a2)
credset_gwas_ubclung <- credset_gwas %>% left_join(ubclung, by="sentinel_ubclung")
credset_gwas_ubclung2 <- credset_gwas_ubclung %>% filter(!is.na(evidence))

##pQTL look-up:
#UKB pQTL:
ukbpqtl <- fread("output/lookup_ukbpqtl.txt")
ukbpqtl <- ukbpqtl %>% separate(LOCUS, c("locus", "sentinel_ukbpqtl"), sep = "/")
ukbpqtl <- ukbpqtl %>% rename(chr=CHROM,posb38=GENPOS)
ukbpqtl$evidence <- as.factor("pqtl_ukb")
credset_gwas_ukbpqtl <- credset_gwas %>% left_join(ukbpqtl, by=c("locus","chr","posb38"))
credset_gwas_ukbpqtl2 <- credset_gwas_ukbpqtl %>% filter(!is.na(evidence))

#SCALLOP:
#Colnmaes:#Chrom    Pos MarkerName  Allele1 Allele2 Freq1   FreqSE  Effect  StdErr  P-value Direction   TotalSampleSize Gene
scallop <- fread("output/scallop_ukbpqtl.txt",head=F,sep="\t")
scallop <- scallop %>% separate(V12, c("TotalSampleSize","Gene"),sep=" ")
colnames(scallop) <- c("Chrom","Pos","MarkerName","Allele1","Allele2","Freq1","FreqSE","Effect", "StdErr","P-value","Direction","TotalSampleSize","Gene")
scallop <- scallop %>% rename(chr=Chrom,posb37=Pos)
scallop$evidence <- as.factor("pqtl_scallop")
credset_gwas_scallop <- credset_gwas %>% left_join(scallop, by=c("chr","posb37"))
credset_gwas_scallop2 <- credset_gwas_scallop %>% filter(!is.na(evidence))

##PoPS:
pops <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/pops/results/all_results_merged_table.txt")
pops <- pops %>% rename(locus=signal_id) %>% relocate(gene_symbol, .before = ENSGID)
pops$evidence <- as.factor("pops")
credset_gwas_pops <- credset_gwas %>% left_join(pops, by=c("locus","sentinel"))
credset_gwas_pops2 <- credset_gwas_pops %>% filter(!is.na(evidence))


##Mouse_ko
mko <- fread("input/mko_results_500kb.csv") %>% filter(overlap == TRUE) %>% select(-Position, -overlap, -MKO) %>% rename(posb37=pos) %>% arrange(chr,posb37)
mko$evidence <- as.factor("mouse_ko")
credset_gwas_mko <- credset_gwas %>% left_join(mko, by=c("locus","sentinel","chr","posb37"))
credset_gwas_mko2 <- credset_gwas_mko %>% filter(!is.na(evidence))

##Rare disease
raredis <- fread("input/raredis_results_500kb_by_gene.csv")
colnames(raredis) <- c("Symbol","sentinel","frequency_disease","distance","name_disease","HPOterm")
raredis$evidence <- as.factor("rare_disease")
credset_gwas_raredis <- credset_gwas %>% left_join(raredis, by= "sentinel")
credset_gwas_raredis2 <- credset_gwas_raredis %>% filter(!is.na(evidence))

#Merge the results altogether to obtain a single file with locus - snp - gene - evidence:
#only thing that can be different are 'evidence' and 'gene'
#and 'tissue' only for eQTL data:
col_for_join <- c("locus","snpid","chr","posb37","posb38","a2","a1","PIP_average","LOG_ODDS","se","eaf","pval","MAF","sentinel","evidence","gene","tissue")
credset_gwas_gtex2 <- credset_gwas_gtex2 %>% rename(gene=gene)
credset_gwas_eqtlgen2 <- credset_gwas_eqtlgen2 %>% rename(gene=gene)
credset_gwas_ubclung2 <- credset_gwas_ubclung2 %>% rename(gene=gene_id)

eqtl_all <- credset_gwas_gtex2 %>%
            full_join(credset_gwas_eqtlgen2,by=col_for_join)%>%
            full_join(credset_gwas_ubclung2, by=col_for_join) %>%
            select("locus","snpid","chr","posb37","posb38","a2","a1","PIP_average","LOG_ODDS","se","eaf","pval","MAF","sentinel","evidence","gene","tissue")

col_for_join <- c("locus","snpid","chr","posb37","gene","evidence","posb38","a2","a1","PIP_average","LOG_ODDS","se","eaf","pval","MAF","sentinel")
credset_gwas_ng2 <- credset_gwas_ng2 %>% rename(gene=Nearest_gene)
fantom5_inscores_clinvar <- fantom5_inscores_clinvar %>% rename(gene=Gene)
credset_gwas_ukbpqtl2 <- credset_gwas_ukbpqtl2 %>% rename(gene=PROTEIN)
credset_gwas_scallop2 <- credset_gwas_scallop2 %>% rename(gene=Gene)
credset_gwas_raredis2 <- credset_gwas_raredis2 %>% rename(gene=Symbol)
credset_gwas_mko2 <- credset_gwas_mko2 %>% rename(gene=Symbol)
credset_gwas_pops2 <- credset_gwas_pops2 %>% rename(gene=gene_symbol)
v2g_all <- eqtl_all %>%
           full_join(credset_gwas_ng2,by=col_for_join) %>%
           full_join(fantom5_inscores_clinvar,by=col_for_join) %>%
           full_join(credset_gwas_ukbpqtl2,by=col_for_join) %>%
           full_join(credset_gwas_scallop2,by=col_for_join) %>%
           full_join(credset_gwas_raredis2,by=col_for_join) %>%
           full_join(credset_gwas_mko2,by=col_for_join) %>%
           full_join(credset_gwas_pops2,by=col_for_join) %>% select(all_of(col_for_join)) %>% arrange(chr,posb37,locus,gene,evidence) %>% unique()

v2g_minimal <- v2g_all %>% select(locus,snpid,chr,posb37,gene) %>% select(locus,gene) %>% unique()

#Save each results for each analysis into a tables to populate a xlsx file:
df_list <- list(v2g_minimal,v2g_all,credset_gwas_ng2,fantom5_inscores_clinvar,credset_gwas_gtex2,credset_gwas_eqtlgen2,
credset_gwas_ubclung2,credset_gwas_ukbpqtl2,credset_gwas_scallop2,credset_gwas_raredis2,credset_gwas_mko2,credset_gwas_pops2)
write_xlsx(df_list,path = "src/report/var2gene_full.xlsx", col_names = TRUE, format_headers = TRUE)

##Rare variant ExWAS: (Single rare-variant and Gene-collpasing rare variant): no genes/results form these analyses

src/Region_plot_V2G.sh

#!/bin/bash

#SBATCH --job-name=regionplot
#SBATCH --output=/scratch/gen1/nnp5/Var_to_Gen_tmp/logerror/%x-%j.out
#SBATCH --time=1:0:0
#SBATCH --mem=30gb
#SBATCH --account=gen1
#SBATCH --export=NONE

#Rationale: visualise V2G results in the different credible sets locus- R2 with regards to SNP with highest average PIP.
module load R

#workdir: Var_to_Gene/
PATH_data="/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data"
#mkdir /scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot
PATH_TMP="/scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot"

##digest FAVOR files:
#awk -F "," '{print $1, $2, $3, $8, $9, $10, $11, $12}' input/FAVOR_credset_chrpos38_2023_08_08.csv \
#    > ${PATH_TMP}/FAVOR_credset_annotations_digest_08_08_23.csv


##Calculate R2 with respect to variant with highest PIP in each locus:
##Use the 10,000 European ancestry individuals in /data/gen1/LF_HRC_transethnic/LD_reference/EUR_UKB:
##find all credible set variants in EUR_UKB by chr and pos (alleles can be flipped):
#awk '{print $3" "$3"_"$4"_"}' /data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/replsugg_valid_credset.txt | \
#    sed 's/ //g' | \
#    grep -F -f - /data/gen1/LF_HRC_transethnic/LD_reference/EUR_UKB/ukb_imp_chr*_EUR_selected_nodups.bim | \
#    awk '{print $2}' > ${PATH_TMP}/cs_variants_EUR_UKB

##find the SNPs in EUR_UKB by chr and pos (alleles can be flipped):
#awk '{print $2}'input/highest_PIP_sentinels | \
#    awk -F "_" '{print $1, $1"_"$2"_"}' | sed 's/ /   /g' | \
#    grep -F -f - /data/gen1/LF_HRC_transethnic/LD_reference/EUR_UKB/ukb_imp_chr*_EUR_selected_nodups.bim | \
#    awk '{print $2}' > ${PATH_TMP}/highest_PIP_sentinels_EUR_UKB

for line in {1..17}
do
SNP=$(awk -v row="$line" ' NR == row {print $0 } ' ${PATH_TMP}/highest_PIP_sentinels_EUR_UKB)
chr=$(awk -F "_" -v row="$line" ' NR == row {print $1 } ' ${PATH_TMP}/highest_PIP_sentinels_EUR_UKB)
SNP_tmp=$(awk -F "_" -v row="$line" ' NR == row {print $1"_"$2 } ' ${PATH_TMP}/highest_PIP_sentinels_EUR_UKB)
start=$(grep ${SNP_tmp}"_" input/highest_PIP_sentinels | awk '{print $1}' | awk -F "_" '{print $(NF-1)}')
end=$(grep ${SNP_tmp}"_" input/highest_PIP_sentinels | awk '{print $1}' | awk -F "_" '{print $(NF)}')

#create R2 according to leading p-value:
#Calculate R2 with respect to the leading SNP:
#module unload plink2/2.00a
#module load plink
#plink --bfile /data/gen1/LF_HRC_transethnic/LD_reference/EUR_UKB/ukb_imp_chr${chr}_EUR_selected_nodups \
#    --chr ${chr} --from-bp ${start} --to-bp ${end} \
#    --allow-no-sex --r2 inter-chr --ld-snp ${SNP} --ld-window-r2 0 \
#    --out ${PATH_TMP}/${SNP}_${start}_${end}_ld_file

Rscript src/Region_plot_V2G_2.R \
    ${PATH_TMP}/${SNP}_${start}_${end}_ld_file.ld \
    output/region_plots_V2G/rp_v2g_${chr}_${SNP}_${start}_${end}.pdf \
    ${chr}_${SNP}_${start}_${end} \
    ${start} ${end} ${chr} ${SNP}

done

src/Region_plot_V2G.R

#!/usr/bin/env Rscript


#Rationale: plot of Genomic Locus and variant-to-gene mapping genes:
##variants in credible set, with lead variant as the one with highest PIP
##functional annotation for only credible set variants, using as by FAVOR
##R2 between variants in the region
##Gene highlighted by the variant-to-gene mapping analysis with the type of evidence

library(tidyverse)
library(data.table)
library(dplyr)

annot <- read.table("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/FAVOR_credset_chrpos38_2023_08_08.csv", sep=",", stringsAsFactors = FALSE, fill=TRUE, header=TRUE)
annot <- annot %>% select("Variant..VCF.","Chromosome","Position","Genecode.Comprehensive.Category") %>%
    rename(id="Variant..VCF.",chromosome="Chromosome",posb38="Position")
annot <- as.data.frame(sapply(annot, function(x) gsub("\"", "", x)))
annot <- annot %>% rename(Functional_annotation=Genecode.Comprehensive.Category)

#liftover online:
#awk -F "_" '{print "chr"$1,$2,$2}' /scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot/cs_variants_EUR_UKB > /home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/cs_variants_EUR_UKB_lifover_input
credset <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot/cs_variants_EUR_UKB",header=FALSE)
credset <- credset %>% separate(V1, "_", into=c("chromosome", "posb37", "allele1", "allele2"))
b38 <- fread("/home/n/nnp5/PhD/PhD_project/Var_to_Gene/input/cs_variants_EUR_UKB_liftover_output.bed",header=FALSE)
b38 <- b38 %>% separate(V4, "-", into=c("chr_pos", "posb37"))
b38$chr_pos <- NULL
b38$posb37 <- as.numeric(b38$posb37)
credset$V1 <- paste0("chr",credset$chromosome)
credset$posb37 <- as.numeric(credset$posb37)
credset_b38 <- left_join(credset,b38,by=c("V1","posb37")) %>% select(-V1, -V3, -V5)
credset_b38 <- credset_b38 %>% rename(posb38=V2)
credset_b38$chromosome <- as.numeric(credset_b38$chromosome)
annot$chromosome <- as.numeric(annot$chromosome)
credset_b38$posb38 <- as.numeric(credset_b38$posb38)
annot$posb38 <- as.numeric(annot$posb38)
credset_annot <- inner_join(credset_b38, annot, by=c("chromosome","posb38"))
credset_annot$credset <- "1"


gwas <- fread("/data/gen1/UKBiobank_500K/severe_asthma/Noemi_PhD/data/maf001_broad_pheno_1_5_ratio_betase_input_mungestat") %>% rename(chromosome=b37chr,posb37=bp)
gwas_credset_annot <- left_join(gwas,credset_annot,by=c("chromosome","posb37"))
gwas_credset_annot <- gwas_credset_annot %>% select(chromosome,posb37,pval,a1,a2,Functional_annotation,credset,snpid)
gwas_credset_annot <- gwas_credset_annot %>% mutate(Functional_annotation=ifelse(is.na(gwas_credset_annot$Functional_annotation), "unknown",gwas_credset_annot$Functional_annotation))
gwas_credset_annot$Functional_annotation <- as.factor(gwas_credset_annot$Functional_annotation)
gwas_credset_annot <- gwas_credset_annot %>% mutate(credset=ifelse(is.na(gwas_credset_annot$credset), 0, gwas_credset_annot$credset))
fwrite(gwas_credset_annot,"/scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot/gwas_credset_annot",quote=F,sep="\t",row.names=F,col.names=T)

src/Region_plot_V2G_2.R

#!/usr/bin/env Rscript


#Rationale: plot of fine-mapping PIP with functional annotation for variants in credible set:

library(tidyverse)
library(data.table)
library(dplyr)
library(RColorBrewer)
library(cowplot)
library(readxl)

args <- commandArgs(T)

r2_file <- args[1]
locus <- as.character(args[2])
locus_ggtitle <- as.character(args[3])
start <- as.numeric(args[4])
end <- as.numeric(args[5])
chr <- args[6]
snp_lead <- as.character(args[7])
print(start)
print(end)
print(chr)
print(snp_lead)
df <- fread("/scratch/gen1/nnp5/Var_to_Gen_tmp/regionalplot/gwas_credset_annot") %>% select(-snpid)
df <- df %>% mutate(snpid = paste0(chromosome,"_",posb37,"_",pmin(a1,a2),"_",pmax(a1,a2)))
df_sugg <- df %>% filter(pval <= 0.000005)
r2 <- fread(r2_file) %>% select(CHR_B,BP_B,SNP_B,R2)
colnames(r2) <- c("chromosome","posb37","snpid","R2")

df_r2 <- left_join(df,r2,by=c("chromosome","posb37","snpid"))

df_r2 <- df_r2 %>% mutate(Functional_annotation=ifelse(is.na(df_r2$Functional_annotation), "unknown",df_r2$Functional_annotation))
df_r2$Functional_annotation <- as.factor(df_r2$Functional_annotation)
df_r2$logP <- -log10(df$pval)

#for R2 color gradient in the plot:
myPalette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))
sc <- scale_colour_gradientn(colours = myPalette(100), limits=c(0, 1))

##Plot with functional annotation and R2:
annot_r2 <- df_r2 %>% ggplot(aes(posb37, logP, shape = Functional_annotation, colour = R2, label = snpid)) +
  geom_point(size=2) +
  scale_shape_manual(values=c("downstream"=15, "exonic"=16, "intergenic"=17, "intronic"=18, "ncRNA_exonic"=19, "ncRNA_intronic"=10, "unknown"=11,"upstream;downstream"=12, "UTR3"=13, "UTR5"=14)) +
  geom_point(data=df_r2[df_r2$credset==1,], pch=21, fill=NA, size=4, colour="red", stroke=1, alpha=0.5) +
  geom_text(aes(label=ifelse(R2>0.95,as.character(snpid),'')),hjust=0,vjust=0,size=2.5,colour="black") +
  theme_minimal() +
  theme(legend.position = "right") + sc +
  ggtitle(paste0(locus_ggtitle," (snp lead:", snp_lead,")")) + theme(plot.title = element_text(hjust=0.5)) + xlim(start,end) + ylab("-log10(pvalue)")


##Plot with functional annotation, R2 and gene location:
#Treated the gene plot as a gantt chart plot !
#prioritised genes and evidence:
l2g <- read_excel("src/report/locus2gene.xlsx",sheet = "L2G_clean") %>% filter(grepl(end,locus)) %>% select(gene)


#This gene list is from the R package LocusZooms
genes <- fread("/home/n/nnp5/software/LocusZooms/Gencode_GRCh37_Genes_UniqueList2021.txt")
#filter gene in the locus:
genes <- genes %>% filter(Chrom==paste0("chr",chr), Start >= start, End <= end)

## Set factor level to order the genes on the plot
genes$Gene <- as.factor(genes$Gene)
plot_gantt_gene <- qplot(ymin = Start,
                    ymax = End,
                    x = Gene,
                    colour = Coding,
                    geom = "linerange",
                    data = genes,
                    size = I(5)) +
    scale_colour_manual(values = c("lncRNA"="burlywood3", "ncRNA"="aquamarine4", "proteincoding"="chocolate", "pseudogene"="chocolate4")) +
    coord_flip() +
    theme_bw() +
    theme(panel.grid = element_blank()) +
    ylab("posb37") +
    xlab("genes") +
    ylim(start,end) + labs(title = paste0("V2G identified genes: ",l2g))


plot_grid(annot_r2 + theme(legend.justification = c(0,1)), plot_gantt_gene + theme(legend.justification = c(0,1)), ncol=1, align='v', rel_heights = c(2,1))
ggsave(locus, width = 50, height = 60, units = "cm")

Results and Conclusion

I identified a total of 98 genes. For each analysis:

  • Nearest gene identified 18 genes;

  • Functional annotation identified 40 genes (FANTOM 14 genes, Integrative scores 37 genes, ClinVar 6 genes);

  • eQTL: GTExV8 8 genes, eQTLgen blood 5 genes, UBC Lung eQTL 3 genes;

  • pQTL: UK Biobank 4 genes, SCALLOP 2 gene (no gene for deCODE);

  • PoPS 16 genes;

  • Mouse knock out 32 genes;

  • Rare disease 6 genes;

No genes were found from the look-up analyses in the exome single or gene-based rare variant analyses. Two variants (chr6_45913842_C_G,chr7_1489310_G_A) were significant, but not in the sentinel regions; they map to genes TTK and INTS1 respectively.
Full variant-to-gene results available here:

Download var2gene_full.xlsx

Out of the 99 genes, 24 genes were supported by at least 2 criteria (Figure 2):

  • 16 genes by two criteria: AC034102.4,AC034114.2,AC044784.1,AC116366.3,CAMK4,HDAC7,HLA-DQA1,HLA-DRB1,HLA-DRB5,IKZF3,IL33,NR1D1,ORMDL3,PGAP3,PPP1R1B,SUOX

  • 6 genes by three criteria: D2HGDH,IL4R,NSMCE1,RORA,RPS26,SMAD3

  • 2 genes by four criteria: BACH2, IL1RL1

## Locus-to-gene Below, the region-plots with linkage-disequilibrium measures wth regards to the variant with highest posterior inclusion probability as by the fine-mapping analysis with gene-mapped in the genomic locus.